clone-alert 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +125 -16
- package/dist/baseline.d.ts +20 -0
- package/dist/baseline.js +105 -0
- package/dist/cli.d.ts +8 -2
- package/dist/cli.js +369 -68
- package/dist/core.d.ts +2 -0
- package/dist/core.js +19 -12
- package/dist/files.d.ts +2 -0
- package/dist/files.js +236 -0
- package/dist/index.d.ts +21 -0
- package/dist/index.js +49 -0
- package/dist/stats.d.ts +28 -0
- package/dist/stats.js +37 -0
- package/package.json +3 -3
- package/scripts/compare-pmd-cpd.mjs +0 -565
package/README.md
CHANGED
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
[](./LICENSE)
|
|
7
7
|
[](https://nodejs.org)
|
|
8
8
|
[](https://www.npmjs.com/package/clone-alert)
|
|
9
|
+
[](#duplication-badge)
|
|
9
10
|
|
|
10
11
|
**clone-alert** finds duplicated and copy‑pasted code across your codebase by comparing **token streams** — the same proven approach as [PMD CPD](https://pmd.github.io/) (Copy‑Paste Detector), but built natively for the JavaScript/TypeScript ecosystem and your frontend templates. Catch code clones, enforce **DRY**, reduce technical debt, and fail your build when duplication creeps in.
|
|
11
12
|
|
|
@@ -18,11 +19,12 @@ npx clone-alert --minimum-tokens 50 --files src
|
|
|
18
19
|
## Why clone-alert?
|
|
19
20
|
|
|
20
21
|
- 🎯 **PMD CPD‑compatible** — a faithful port of PMD's match algorithm and JavaScript/TypeScript tokenizers, validated against PMD's own golden fixtures.
|
|
21
|
-
- ⚡ **Fast on large monorepos** — a struct‑of‑arrays token core with a Karp–Rabin rolling hash and radix‑sorted buckets. In our [benchmarks](#benchmarks) it runs **10–
|
|
22
|
+
- ⚡ **Fast on large monorepos** — a struct‑of‑arrays token core with a Karp–Rabin rolling hash and radix‑sorted buckets. In our [benchmarks](#benchmarks) it runs **10–27× faster** than PMD CPD while using **1.3–2.6× less memory**, on real codebases from Next.js to nx.
|
|
22
23
|
- 🧩 **Frontend templates, natively** — tokenizes **Vue** `<template>`, **Svelte** markup, and **Angular** templates, not just `<script>` blocks. Detects template‑to‑script duplication too.
|
|
23
24
|
- 🧪 **Zero‑config CLI** — sensible defaults, recursive directory scan, `node_modules`/`.git`/`dist` skipped automatically.
|
|
24
25
|
- 📦 **Tiny footprint** — a single runtime dependency (`typescript`). Framework parsers are **optional peer dependencies**, loaded only when needed.
|
|
25
|
-
- 🛠 **CI‑ready** — `text`, `json`,
|
|
26
|
+
- 🛠 **CI‑ready** — `text`, `json`, PMD‑style `xml` / `csv`, and **SARIF** (GitHub Code Scanning) reports; fails the build on duplication by default (exit code `4`), like PMD CPD.
|
|
27
|
+
- 📉 **Baseline for adoption** — accept the clones an existing project already has and fail CI only on **new** ones. Fingerprints are content‑based, so the baseline survives code moving around.
|
|
26
28
|
- 🔇 **Inline suppression** — ignore known duplication with `CPD-OFF` / `CPD-ON` comment markers.
|
|
27
29
|
|
|
28
30
|
## Supported languages & frameworks
|
|
@@ -59,14 +61,19 @@ Requires **Node.js 18+**.
|
|
|
59
61
|
## Quick start
|
|
60
62
|
|
|
61
63
|
```sh
|
|
62
|
-
# Scan a folder and print a human‑readable report
|
|
64
|
+
# Scan a folder and print a human‑readable report.
|
|
65
|
+
# Like PMD CPD, this exits 4 when duplication is found — so it fails CI out of the box.
|
|
63
66
|
clone-alert --minimum-tokens 50 --files src
|
|
64
67
|
|
|
65
|
-
#
|
|
66
|
-
clone-alert --minimum-tokens 50 --files src --fail-on-violation
|
|
68
|
+
# Just want the report, never a failing exit code? Opt out:
|
|
69
|
+
clone-alert --minimum-tokens 50 --files src --no-fail-on-violation
|
|
67
70
|
|
|
68
|
-
# Machine‑readable output for dashboards
|
|
69
|
-
clone-alert --format json --files src,packages > duplication.json
|
|
71
|
+
# Machine‑readable output for dashboards (don't fail the job that builds the artifact)
|
|
72
|
+
clone-alert --format json --files src,packages --no-fail-on-violation > duplication.json
|
|
73
|
+
|
|
74
|
+
# Adopt an existing project: accept today's clones, fail only on new ones
|
|
75
|
+
clone-alert --files src --baseline .clone-alert-baseline.json --update-baseline
|
|
76
|
+
clone-alert --files src --baseline .clone-alert-baseline.json --fail-on-violation
|
|
70
77
|
```
|
|
71
78
|
|
|
72
79
|
## Usage
|
|
@@ -80,11 +87,16 @@ clone-alert [options] [<path>...]
|
|
|
80
87
|
| Option | Description |
|
|
81
88
|
| --- | --- |
|
|
82
89
|
| `--files <path[,path...]>` | Files or directories to scan. Can be repeated. |
|
|
90
|
+
| `--file-list <path>` | Read newline-separated paths to scan from a file. |
|
|
83
91
|
| `--minimum-tokens <n>` | Minimum duplicated token span. Default: `50`. |
|
|
84
92
|
| `--minimum-tile-size <n>` | Alias for `--minimum-tokens`. |
|
|
85
|
-
| `--format <text
|
|
93
|
+
| `--format <fmt>` | `text` (default), `xml`, `json`, `sarif`, `csv`, `csv_with_linecount_per_file`, `markdown`, `ai`, `shields`. `sarif` targets GitHub Code Scanning; the two `csv` formats mirror PMD's CSV renderers. `xml`/`json`/`markdown` embed the duplicated code (PMD's `<codefragment>`, a jscpd-style `fragment` field, and a fenced code block respectively). `ai` is a compact, token-frugal listing for LLM pipelines. `shields` prints a [shields.io endpoint](#duplication-badge) JSON for a duplication badge. `text` and `ai` end with a `N clones · X% duplicated lines` summary. |
|
|
86
94
|
| `--extensions <ext[,ext...]>` | Extensions to include during recursive scans. |
|
|
87
|
-
| `--exclude <glob[,glob...]>` | Exclude files or directories (glob). Can be repeated. |
|
|
95
|
+
| `--exclude <glob[,glob...]>` | Exclude files or directories (glob). Can be repeated. Prunes the walk, not a post-filter — excluded directories are never read. |
|
|
96
|
+
| `--non-recursive` | Scan only the top level of each directory. |
|
|
97
|
+
| `--gitignore` / `--no-gitignore` | Skip files ignored by `.gitignore`; both nested `.gitignore` files and the repo-root one are honored. On by default. |
|
|
98
|
+
| `--skip-duplicate-files` | Skip files with the same name and byte length (PMD parity). |
|
|
99
|
+
| `--skip-lexical-errors` | Skip files that fail to tokenize instead of aborting the whole run. |
|
|
88
100
|
| `--ignore-identifiers` / `--no-ignore-identifiers` | Normalize or compare identifier names. Strict by default, like PMD. |
|
|
89
101
|
| `--ignore-literals` / `--no-ignore-literals` | Normalize or compare literals. Strict by default, like PMD. |
|
|
90
102
|
| `--pmd-typescript-compatibility` / `--no-…` | Match PMD `typescript` granularity for `.ts/.tsx` (split template literals into atoms, collapse regexp). On by default. |
|
|
@@ -92,7 +104,9 @@ clone-alert [options] [<path>...]
|
|
|
92
104
|
| `--vue-templates` / `--no-vue-templates` | Tokenize `.vue` markup, not just `<script>`. On by default. |
|
|
93
105
|
| `--angular-inline-templates` | Also scan Angular `@Component` inline templates. |
|
|
94
106
|
| `--skip-angular-inline-templates` | Do not scan inline Angular templates (explicit default). |
|
|
95
|
-
| `--fail-on-violation` | Exit with code `4` when duplications are found. |
|
|
107
|
+
| `--fail-on-violation` / `--no-fail-on-violation` | Exit with code `4` when duplications are found. **On by default**, like PMD CPD; pass `--no-fail-on-violation` to always exit `0`. |
|
|
108
|
+
| `--baseline <path>` | Ignore duplications recorded in this baseline file; report and fail only on **new** ones. Matched by content fingerprint, so accepted clones stay suppressed even after the code moves. |
|
|
109
|
+
| `--update-baseline` | Write/regenerate the baseline file at `--baseline` with all current duplications, then exit `0`. Run once to adopt existing debt. |
|
|
96
110
|
| `-h, --help` | Show help. |
|
|
97
111
|
| `-V, --version` | Show version. |
|
|
98
112
|
|
|
@@ -159,6 +173,94 @@ const generatedTableB = { /* ... */ };
|
|
|
159
173
|
// CPD-ON
|
|
160
174
|
```
|
|
161
175
|
|
|
176
|
+
## Baseline (adopting an existing project)
|
|
177
|
+
|
|
178
|
+
An existing project that adopts clone-alert can have thousands of pre‑existing clones — enough to light up CI red on day one. A **baseline** lets you accept that debt and gate only on what's added afterwards.
|
|
179
|
+
|
|
180
|
+
Generate it once, commit it, then check against it in CI:
|
|
181
|
+
|
|
182
|
+
```sh
|
|
183
|
+
# 1. Record today's duplications (writes the file, exits 0)
|
|
184
|
+
clone-alert --files src --baseline .clone-alert-baseline.json --update-baseline
|
|
185
|
+
|
|
186
|
+
# 2. In CI: fail only on clones not in the baseline
|
|
187
|
+
clone-alert --files src --baseline .clone-alert-baseline.json --fail-on-violation
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
The baseline is a small, sorted JSON file you commit and review in pull requests:
|
|
191
|
+
|
|
192
|
+
```json
|
|
193
|
+
{
|
|
194
|
+
"version": 1,
|
|
195
|
+
"clones": [
|
|
196
|
+
{
|
|
197
|
+
"fingerprint": "00a034a93cd6e7e3",
|
|
198
|
+
"tokens": 414,
|
|
199
|
+
"files": ["src/server/webkit/webview/wvPage.ts", "src/server/webkit/wkPage.ts"]
|
|
200
|
+
}
|
|
201
|
+
]
|
|
202
|
+
}
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
Each clone is matched by a **content fingerprint** hashed over its tokens only — no line numbers, no file paths. So a baselined clone stays suppressed when the code is moved, reformatted, or shifted by edits above it, and the file produces a stable, churn‑free diff. Introduce a genuinely new duplication and CI fails on that one alone. Re‑run `--update-baseline` to re‑adopt after an intentional change.
|
|
206
|
+
|
|
207
|
+
> The baseline filters the already‑computed match set, so it adds no measurable cost to a scan — there's no separate cache to persist between CI runs.
|
|
208
|
+
|
|
209
|
+
## GitHub Code Scanning (SARIF)
|
|
210
|
+
|
|
211
|
+
`--format sarif` emits a SARIF 2.1.0 log that GitHub ingests as code‑scanning alerts, shown inline in pull requests and in the repository's Security tab. Each duplication's stable content fingerprint is written to `partialFingerprints`, so GitHub tracks an alert across commits and **does not re‑raise it when the clone simply moves**. Artifact URIs are relative to the working directory, so they map onto the checked‑out tree.
|
|
212
|
+
|
|
213
|
+
```yaml
|
|
214
|
+
# .github/workflows/clone-alert.yml
|
|
215
|
+
name: clone-alert
|
|
216
|
+
on: [push, pull_request]
|
|
217
|
+
jobs:
|
|
218
|
+
duplication:
|
|
219
|
+
runs-on: ubuntu-latest
|
|
220
|
+
permissions:
|
|
221
|
+
security-events: write # required to upload SARIF
|
|
222
|
+
steps:
|
|
223
|
+
- uses: actions/checkout@v4
|
|
224
|
+
- uses: actions/setup-node@v4
|
|
225
|
+
with: { node-version: 20 }
|
|
226
|
+
# --no-fail-on-violation so the step exits 0 and the SARIF still uploads;
|
|
227
|
+
# GitHub surfaces the duplications as code-scanning alerts instead.
|
|
228
|
+
- run: npx clone-alert src --format sarif --no-fail-on-violation > clone-alert.sarif
|
|
229
|
+
- uses: github/codeql-action/upload-sarif@v3
|
|
230
|
+
with:
|
|
231
|
+
sarif_file: clone-alert.sarif
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
Combine it with a committed `--baseline` to surface only the duplications added after adoption.
|
|
235
|
+
|
|
236
|
+
## Duplication badge
|
|
237
|
+
|
|
238
|
+
Show off how clean your codebase is with a [shields.io](https://shields.io/badges/endpoint-badge) badge. `--format shields` prints a shields **endpoint JSON** to stdout — host it (a committed file, a gist, anywhere reachable) and point shields at it:
|
|
239
|
+
|
|
240
|
+
```sh
|
|
241
|
+
clone-alert src --minimum-tokens 70 --format shields --no-fail-on-violation > clone-alert-badge.json
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
```md
|
|
245
|
+
[![clone-alert](https://img.shields.io/endpoint?url=<URL-of-your-clone-alert-badge.json>)](https://github.com/BaryshevRS/clone-alert)
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
shields fetches the JSON and renders the badge, so it refreshes whenever you regenerate the file. The color comes from a fixed scale, tuned to reward near‑zero duplication:
|
|
249
|
+
|
|
250
|
+
| Result | Color | |
|
|
251
|
+
| --- | --- | --- |
|
|
252
|
+
| **0 clones** | 🟢 bright green | the flex — zero copy‑paste |
|
|
253
|
+
| **≤ 3%** | 🟢 green | clean |
|
|
254
|
+
| **≤ 10%** | 🟡 yellow | has some debt |
|
|
255
|
+
| **> 10%** | 🔴 red | needs attention |
|
|
256
|
+
|
|
257
|
+
The percentage is `duplicated lines / total scanned lines`, so it tracks your chosen `--minimum-tokens` (and which files you scan — exclude `**/*.test.*` and fixtures to badge production code only). Regenerate it in CI to keep it fresh:
|
|
258
|
+
|
|
259
|
+
```yaml
|
|
260
|
+
- run: npx clone-alert src --minimum-tokens 70 --format shields --no-fail-on-violation > clone-alert-badge.json
|
|
261
|
+
# then commit the file (or push it to a gist) so shields serves the latest value
|
|
262
|
+
```
|
|
263
|
+
|
|
162
264
|
## Programmatic API
|
|
163
265
|
|
|
164
266
|
clone-alert ships with TypeScript types and a small Node API:
|
|
@@ -184,17 +286,17 @@ Framework template tokens live in a separate namespace from script tokens, so ma
|
|
|
184
286
|
|
|
185
287
|
## Benchmarks
|
|
186
288
|
|
|
187
|
-
clone-alert is a drop-in for PMD CPD that runs **10–
|
|
289
|
+
clone-alert is a drop-in for PMD CPD that runs **10–27× faster** while using **1.3–2.6× less memory** — on the same files, finding the same clones.
|
|
188
290
|
|
|
189
291
|
Measured with [`npm run compare:pmd`](#development) on five real-world TypeScript codebases. Only pure `.ts` is compared (the exact file set PMD's `typescript` lexer can parse), so all tools see byte-identical input. macOS, Node 20, `--minimum-tokens 50`, JVM start-up counted for PMD as in real CLI use:
|
|
190
292
|
|
|
191
293
|
| Repository | clone-alert | PMD CPD | Speed‑up | Peak RAM (clone vs PMD) | Agreement with PMD¹ |
|
|
192
294
|
| --- | --- | --- | --- | --- | --- |
|
|
193
|
-
| `nestjs/nest` | **
|
|
194
|
-
| `angular/components` | **1.
|
|
195
|
-
| `microsoft/playwright` | **
|
|
196
|
-
| `vercel/next.js` | **
|
|
197
|
-
| `nrwl/nx` | **8.
|
|
295
|
+
| `nestjs/nest` | **0.7 s** | 15.4 s | **23×** | 203 MB vs 526 MB (2.6× less) | 100% |
|
|
296
|
+
| `angular/components` | **1.6 s** | 41.9 s | **27×** | 338 MB vs 632 MB (1.9× less) | 95%² |
|
|
297
|
+
| `microsoft/playwright` | **3.6 s** | 58.7 s | **16×** | 836 MB vs 1.6 GB (1.9× less) | 99.98% |
|
|
298
|
+
| `vercel/next.js` | **6.0 s** | 73.6 s | **12×** | 1.3 GB vs 1.7 GB (1.3× less) | 99.2% |
|
|
299
|
+
| `nrwl/nx` | **8.1 s** | 83.2 s | **10×** | 2.1 GB vs 3.2 GB (1.5× less) | 99.9% |
|
|
198
300
|
|
|
199
301
|
<sub>¹ Jaccard overlap of the file pairs both tools flag as duplicated. ² `angular/components` ships ~20 near‑identical table demos sharing the same 398‑token block. clone-alert and PMD cut that clone's boundary **identically** (398, 391, 390, 210… tokens, token‑for‑token); they only disagree on *which* of the interchangeable demo files get grouped into the same `<duplication>` — a symmetric clustering tie‑break, not missed or mis‑sized duplication. On this small sample (~2 000 file pairs) that grouping noise is the whole 5%.</sub>
|
|
200
302
|
|
|
@@ -220,8 +322,15 @@ Because the tokens are identical and the match engine is a faithful port of PMD'
|
|
|
220
322
|
| Svelte markup | ✅ (Svelte 5+) | ➖ | ➖ |
|
|
221
323
|
| Angular templates | ✅ | ➖ | flat HTML only |
|
|
222
324
|
| PMD CPD algorithm parity | ✅ | — | ➖ |
|
|
325
|
+
| CI baseline (fail only on new) | ✅ committed fingerprint file | ➖ | ⚠️ via on‑disk cache¹ |
|
|
326
|
+
| SARIF / GitHub Code Scanning | ✅ | ➖ | ✅ |
|
|
327
|
+
| Report formats | text, xml, json, sarif, csv, markdown, ai, shields | text, xml, csv, vs | many |
|
|
328
|
+
| PMD CLI flags (`--file-list`, `--non-recursive`, `--skip-duplicate-files`, `--skip-lexical-errors`) | ✅ | ✅ | ➖ |
|
|
329
|
+
| `.gitignore` aware | ✅ (on by default, prunes walk) | ➖ | ✅ |
|
|
223
330
|
| Install size | tiny (1 dep) | JVM required | npm package |
|
|
224
331
|
|
|
332
|
+
¹ jscpd derives "new vs known" from a persistent store (LevelDB) that you must keep between runs; clone-alert commits a small, reviewable JSON baseline and stays stateless.
|
|
333
|
+
|
|
225
334
|
## Development
|
|
226
335
|
|
|
227
336
|
```sh
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { Cpd, Match } from './index';
|
|
2
|
+
/** One accepted clone, as persisted in the baseline file. */
|
|
3
|
+
export interface CloneRecord {
|
|
4
|
+
fingerprint: string;
|
|
5
|
+
tokens: number;
|
|
6
|
+
files: string[];
|
|
7
|
+
}
|
|
8
|
+
/**
|
|
9
|
+
* Stable content fingerprint of a match: see {@link hashImages}. Both occurrences
|
|
10
|
+
* of a match share identical span content, so the choice of mark does not matter.
|
|
11
|
+
*/
|
|
12
|
+
export declare function fingerprint(cpd: Cpd, match: Match): string;
|
|
13
|
+
/** Fingerprints of every clone recorded in the baseline file. */
|
|
14
|
+
export declare function readBaseline(baselinePath: string): Set<string>;
|
|
15
|
+
/**
|
|
16
|
+
* Serialize the accepted clones to the baseline file. Entries are sorted by
|
|
17
|
+
* fingerprint and carry no line/column, so the file stays a stable, reviewable,
|
|
18
|
+
* churn-free diff as long as the duplicated content itself does not change.
|
|
19
|
+
*/
|
|
20
|
+
export declare function writeBaseline(baselinePath: string, clones: CloneRecord[]): void;
|
package/dist/baseline.js
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.fingerprint = fingerprint;
|
|
37
|
+
exports.readBaseline = readBaseline;
|
|
38
|
+
exports.writeBaseline = writeBaseline;
|
|
39
|
+
/**
|
|
40
|
+
* Baseline support for the CLI: the content-fingerprint hash plus reading and
|
|
41
|
+
* writing the baseline JSON file. This is purely an adoption/CI concern, kept out
|
|
42
|
+
* of the core engine (a faithful PMD port) and the tokenizers.
|
|
43
|
+
*
|
|
44
|
+
* @packageDocumentation
|
|
45
|
+
*/
|
|
46
|
+
const fs = __importStar(require("node:fs"));
|
|
47
|
+
/**
|
|
48
|
+
* Stable content fingerprint of a match: see {@link hashImages}. Both occurrences
|
|
49
|
+
* of a match share identical span content, so the choice of mark does not matter.
|
|
50
|
+
*/
|
|
51
|
+
function fingerprint(cpd, match) {
    // Only the token images of the matched span feed the hash; nothing else
    // about the match is passed on.
    const images = cpd.spanImages(match);
    return hashImages(images);
}
|
|
54
|
+
/**
|
|
55
|
+
* 64-bit hash (16 hex chars) over the token *images* only — no file path, no
|
|
56
|
+
* line/column — so it is unchanged when the duplicated code moves within or
|
|
57
|
+
* between files. Deterministic across runs and machines (a pure function of the
|
|
58
|
+
* image strings). A boundary byte between tokens keeps `["ab","c"]` distinct from
|
|
59
|
+
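* `["a","bc"]`. Caveat: the boundary value 0xff equals the UTF-16 code unit of U+00FF, so an image containing "ÿ" is not separated unambiguously (`["aÿb"]` hashes like `["a","b"]`).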
|
|
60
|
+
*/
|
|
61
|
+
function hashImages(images) {
    const LANE_A_MULTIPLIER = 0x01000193; // FNV prime
    const LANE_B_MULTIPLIER = 0xc2b2ae35;
    const TOKEN_BOUNDARY = 0xff;
    // Two independent 32-bit lanes; concatenated they form the 16-hex-char result.
    let laneA = 0x811c9dc5 | 0; // FNV-1a offset basis
    let laneB = 0x85ebca6b | 0;
    // XOR one code unit into both lanes, then multiply each lane (mod 2^32).
    const absorb = (unit) => {
        laneA = Math.imul(laneA ^ unit, LANE_A_MULTIPLIER);
        laneB = Math.imul(laneB ^ unit, LANE_B_MULTIPLIER);
    };
    for (const image of images) {
        let index = 0;
        while (index < image.length) {
            absorb(image.charCodeAt(index));
            index += 1;
        }
        // Mark the end of every token so token boundaries influence the hash.
        absorb(TOKEN_BOUNDARY);
    }
    return toHex8(laneA) + toHex8(laneB);
}
/** Render a 32-bit integer as exactly eight lowercase hex digits (unsigned). */
function toHex8(h) {
    const unsigned = h >>> 0;
    return unsigned.toString(16).padStart(8, '0');
}
|
|
78
|
+
/**
 * Load the fingerprints of every clone recorded in a baseline file.
 *
 * @param baselinePath Path of the baseline JSON file.
 * @returns The set of string fingerprints found in the file's `clones` array.
 *   The set is empty when the file has no `clones` key. Entries that are not
 *   objects, or whose `fingerprint` is not a non-empty string, are skipped.
 * @throws Error when the file does not exist, cannot be read, is not valid
 *   JSON, or is not a JSON object whose `clones` (if present) is an array.
 */
function readBaseline(baselinePath) {
    if (!fs.existsSync(baselinePath)) {
        throw new Error(`baseline file not found: ${baselinePath} (run with --update-baseline to create it)`);
    }
    // Read and parse separately so an I/O failure (directory, permissions, ...)
    // is not misreported as a JSON syntax problem.
    let raw;
    try {
        raw = fs.readFileSync(baselinePath, 'utf-8');
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        throw new Error(`cannot read baseline file: ${baselinePath} (${reason})`);
    }
    let parsed;
    try {
        parsed = JSON.parse(raw);
    }
    catch {
        throw new Error(`baseline file is not valid JSON: ${baselinePath}`);
    }
    const malformed = `baseline file is malformed (expected an object with a "clones" array): ${baselinePath}`;
    // `null`, primitives and arrays are valid JSON but not a baseline document.
    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
        throw new Error(malformed);
    }
    // A missing `clones` key is still accepted as "nothing baselined".
    const clones = parsed.clones ?? [];
    if (!Array.isArray(clones)) {
        throw new Error(malformed);
    }
    const fingerprints = new Set();
    for (const clone of clones) {
        const value = clone !== null && typeof clone === 'object' ? clone.fingerprint : undefined;
        // Only non-empty strings can ever match a computed fingerprint.
        if (typeof value === 'string' && value !== '') {
            fingerprints.add(value);
        }
    }
    return fingerprints;
}
|
|
97
|
+
/**
|
|
98
|
+
* Serialize the accepted clones to the baseline file. Entries are sorted by
|
|
99
|
+
* fingerprint and carry no line/column, so the file stays a stable, reviewable,
|
|
100
|
+
* churn-free diff as long as the duplicated content itself does not change.
|
|
101
|
+
*/
|
|
102
|
+
function writeBaseline(baselinePath, clones) {
    // Order records by plain string comparison of their fingerprints.
    const byFingerprint = (left, right) => {
        if (left.fingerprint < right.fingerprint) {
            return -1;
        }
        if (left.fingerprint > right.fingerprint) {
            return 1;
        }
        return 0;
    };
    // Sort a copy so the caller's array keeps its order.
    const ordered = Array.from(clones);
    ordered.sort(byFingerprint);
    const document = { version: 1, clones: ordered };
    // Two-space indented JSON followed by a trailing newline.
    const text = JSON.stringify(document, null, 2) + '\n';
    fs.writeFileSync(baselinePath, text);
}
|
package/dist/cli.d.ts
CHANGED
|
@@ -1,14 +1,20 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
import { collectFiles } from './files';
|
|
2
3
|
import { type CpdOptions } from './index';
|
|
3
|
-
type ReportFormat = 'text' | 'xml' | 'json';
|
|
4
|
+
type ReportFormat = 'text' | 'xml' | 'json' | 'sarif' | 'csv' | 'csv_with_linecount_per_file' | 'markdown' | 'ai' | 'shields';
|
|
4
5
|
interface CliOptions extends CpdOptions {
|
|
5
6
|
paths: string[];
|
|
6
7
|
extensions: Set<string>;
|
|
7
8
|
excludePatterns: string[];
|
|
9
|
+
respectGitignore: boolean;
|
|
10
|
+
nonRecursive: boolean;
|
|
11
|
+
skipDuplicateFiles: boolean;
|
|
12
|
+
skipLexicalErrors: boolean;
|
|
8
13
|
format: ReportFormat;
|
|
9
14
|
failOnViolation: boolean;
|
|
15
|
+
baselinePath?: string;
|
|
16
|
+
updateBaseline: boolean;
|
|
10
17
|
}
|
|
11
18
|
declare function main(argv: string[]): number;
|
|
12
19
|
declare function parseArgs(argv: string[]): CliOptions;
|
|
13
|
-
declare function collectFiles(paths: string[], extensions: Set<string>, excludePatterns?: string[]): string[];
|
|
14
20
|
export { collectFiles, main, parseArgs };
|