kindred-drift 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +139 -0
- package/bin/kindred.js +4 -0
- package/package.json +39 -0
- package/src/cli.js +125 -0
- package/src/cluster.js +149 -0
- package/src/diff.js +111 -0
- package/src/normalize.js +44 -0
- package/src/render.js +91 -0
- package/src/scan.js +158 -0
- package/src/similarity.js +96 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ben Malaga
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
# kindred
|
|
4
|
+
|
|
5
|
+
**Template-less drift detection for a folder of sibling repos.**
|
|
6
|
+
|
|
7
|
+
You never declared a template, but your repos clearly share one. kindred finds it, and shows you where each repo has wandered off.
|
|
8
|
+
|
|
9
|
+
[CI status](https://github.com/BenMalaga/kindred/actions/workflows/test.yml)
|
|
10
|
+
[Releases](https://github.com/BenMalaga/kindred/releases)
|
|
11
|
+
[Node.js](https://nodejs.org)
|
|
12
|
+
[package.json](package.json)
|
|
13
|
+
[License](LICENSE)
|
|
14
|
+
|
|
15
|
+
</div>
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
If you maintain a handful of small repos, you know the pattern. You copy a LICENSE, a `.gitignore`, a CI workflow from the last project into the new one. Six months later one repo has a fixed workflow, another has a stale `.gitignore`, a third never got `CONTRIBUTING.md` at all. There is a shared template in there somewhere. It just lives in your head.
|
|
20
|
+
|
|
21
|
+
kindred scans a folder of sibling git repos, clusters the files they share, and prints a drift matrix: which files are identical everywhere, which have drifted (and by how much), and which repos are missing them entirely. No template file, no manifest, no config. The repos themselves are the template.
|
|
22
|
+
|
|
23
|
+
It is strictly read-only. kindred never writes a single byte inside the repos it scans.
|
|
24
|
+
|
|
25
|
+
## A real run
|
|
26
|
+
|
|
27
|
+
This is actual output from running kindred on the folder where kindred itself was built, alongside its sibling projects:
|
|
28
|
+
|
|
29
|
+
```
|
|
30
|
+
$ kindred scan "/Users/benmalaga/Github Projects"
|
|
31
|
+
4 repos under /Users/benmalaga/Github Projects:
|
|
32
|
+
claudemd-check, lockbisect, nopus, wastegate
|
|
33
|
+
|
|
34
|
+
file claudemd-check lockbisect nopus wastegate
|
|
35
|
+
-------------------------- -------------- ---------- ----- ---------
|
|
36
|
+
.github/workflows/test.yml = = -- =
|
|
37
|
+
.gitignore ~75% = = =
|
|
38
|
+
CONTRIBUTING.md -- = ~24% ~33%
|
|
39
|
+
LICENSE = = = =
|
|
40
|
+
README.md = ~30% ~27% ~33%
|
|
41
|
+
package.json = ~58% ~56% ~57%
|
|
42
|
+
src/cli.js = ~26% ~14% ~31%
|
|
43
|
+
|
|
44
|
+
legend: = identical to reference ~NN% drifted (similarity) -- missing
|
|
45
|
+
7 shared files across 4 repos: 1 identical everywhere, 5 drifted, 2 missing somewhere
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
One glance tells the whole story: the LICENSE is in sync everywhere, `nopus` never got a CI workflow, `claudemd-check` is missing `CONTRIBUTING.md`, and one repo's `.gitignore` has drifted from the other three. To see exactly how, ask for the diff:
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
$ kindred diff .gitignore "/Users/benmalaga/Github Projects"
|
|
52
|
+
--- lockbisect/.gitignore
|
|
53
|
+
+++ claudemd-check/.gitignore
|
|
54
|
+
@@ -1,4 +1,4 @@
|
|
55
|
+
node_modules/
|
|
56
|
+
*.log
|
|
57
|
+
.DS_Store
|
|
58
|
+
-*.tgz
|
|
59
|
+
+.claudemd-baseline.json
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Install
|
|
63
|
+
|
|
64
|
+
No install needed:
|
|
65
|
+
|
|
66
|
+
```sh
|
|
67
|
+
npx kindred-drift scan ~/projects
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Or install globally:
|
|
71
|
+
|
|
72
|
+
```sh
|
|
73
|
+
npm install -g kindred-drift
|
|
74
|
+
kindred scan ~/projects
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Or run straight from a clone (there are zero dependencies, so there is no `npm install` step):
|
|
78
|
+
|
|
79
|
+
```sh
|
|
80
|
+
git clone https://github.com/BenMalaga/kindred.git
|
|
81
|
+
node kindred/bin/kindred.js scan ~/projects
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
> **Why `kindred-drift`?** The npm names `kindred` (a 2012 blogging engine) and `kindred-cli` were both already taken, so the package is published as `kindred-drift`. The binary is still `kindred`.
|
|
85
|
+
|
|
86
|
+
Requires Node 18 or newer. Zero runtime dependencies, zero dev dependencies.
|
|
87
|
+
|
|
88
|
+
## Usage
|
|
89
|
+
|
|
90
|
+
```
|
|
91
|
+
kindred scan [dir] [--json] scan sibling git repos under dir (default: cwd)
|
|
92
|
+
and print the drift matrix
|
|
93
|
+
kindred diff <relpath> [dir] unified diff between the variants of a shared file
|
|
94
|
+
kindred --help show help
|
|
95
|
+
kindred --version show version
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
`scan` looks at the immediate subdirectories of `dir`, keeps the ones that are git repos, and compares the files they share. `--json` emits the full result (statuses and similarity scores per repo) for scripting.
|
|
99
|
+
|
|
100
|
+
`diff` shows a unified diff between the reference variant of a shared file and every other variant. If all copies are identical it says so and exits 0. If the file is not shared, it exits 1.
|
|
101
|
+
|
|
102
|
+
## How it works
|
|
103
|
+
|
|
104
|
+
1. **Discover.** Immediate subdirectories of the target folder that contain `.git` are treated as sibling repos. Everything else is ignored, including half-checked-out folders that are not repos yet.
|
|
105
|
+
|
|
106
|
+
2. **Collect.** Each repo is walked, skipping `node_modules`, `.git`, `dist`, `build`, `coverage`, and friends. Binary files, lockfiles (`package-lock.json`, `yarn.lock`, and other machine-generated files), and files over 1 MiB are excluded.
|
|
107
|
+
|
|
108
|
+
3. **Cluster.** A file becomes a comparison cluster if its relative path appears in 2 or more repos, or if it is a well-known shareable file (LICENSE, `.gitignore`, anything under `.github/`, `tsconfig.json`, `.eslintrc*`, `CONTRIBUTING.md`, and so on) present in at least one repo. Well-known files are reported even when only one repo has them, because absence is drift too.
|
|
109
|
+
|
|
110
|
+
4. **Normalize.** Before comparison, content is normalized: CRLF and bare CR line endings become LF, trailing spaces and tabs are stripped from every line, runs of blank lines collapse to one, and leading and trailing blank lines are dropped. Two files that differ only cosmetically count as identical.
|
|
111
|
+
|
|
112
|
+
5. **Compare.** Within each cluster, identical normalized content is grouped into variants. The variant held by the most repos becomes the reference (ties break toward the variant that contains the alphabetically first repo). Every other variant gets a similarity score against the reference: the normalized diff ratio `2 * LCS(a, b) / (|a| + |b|)` computed over lines, the same family of metric as Python's `difflib.ratio`. It is order-sensitive on purpose: reordered config is changed config. For very large file pairs, kindred falls back to a linear-time multiset overlap ratio.
|
|
113
|
+
|
|
114
|
+
6. **Report.** Each repo's cell in the matrix is one of: `=` (holds the reference variant), `~NN%` (drifted, with similarity), or `--` (missing).
|
|
115
|
+
|
|
116
|
+
## Why not cruft, copier, or repo-file-sync-action?
|
|
117
|
+
|
|
118
|
+
Those are all excellent tools, and they all share one assumption: a declared template that exists before your repos do. [cruft](https://github.com/cruft/cruft) and [copier](https://github.com/copier-org/copier) check projects against a cookiecutter-style template repo. [repo-file-sync-action](https://github.com/BetaHuhn/repo-file-sync-action) pushes files from a designated source repo to targets you enumerate in a config file.
|
|
119
|
+
|
|
120
|
+
kindred starts from the opposite end. Most people with five sibling repos never made a template; they made five repos that rhyme. kindred infers the implicit template you already have, by observation, and tells you where reality disagrees with it. There is nothing to set up, nothing to declare, and nothing to keep in sync about the syncing tool itself. Run one command in a folder you already have.
|
|
121
|
+
|
|
122
|
+
If kindred convinces you that you want a real template, great: graduate to copier. Until then, you do not need one to see your drift.
|
|
123
|
+
|
|
124
|
+
## Roadmap
|
|
125
|
+
|
|
126
|
+
v1 is deliberately read-only: scan, matrix, diff. Planned next:
|
|
127
|
+
|
|
128
|
+
- **`kindred bless <relpath> <repo>`**: mark one repo's variant as the canonical one for that file, recorded outside the scanned repos.
|
|
129
|
+
- **`kindred apply <relpath>`**: copy the blessed (or reference) variant into the repos that drifted or lack the file, with a dry-run by default and per-repo confirmation. This will be the first and only command that writes into scanned repos, and it will be loudly explicit about it.
|
|
130
|
+
- Similarity-based clustering of files whose paths differ but whose content rhymes (the `ci.yml` in one repo that is really the `test.yml` from the others).
|
|
131
|
+
- `--fail-on-drift` exit codes for CI.
|
|
132
|
+
|
|
133
|
+
## Contributing
|
|
134
|
+
|
|
135
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md). The short version: zero dependencies, Node 18+, `node --test` must stay green, and scanning must stay read-only.
|
|
136
|
+
|
|
137
|
+
## License
|
|
138
|
+
|
|
139
|
+
[MIT](LICENSE) (c) 2026 Ben Malaga
|
package/bin/kindred.js
ADDED
package/package.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "kindred-drift",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Template-less drift detection for a folder of sibling repos",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"kindred": "bin/kindred.js"
|
|
8
|
+
},
|
|
9
|
+
"files": [
|
|
10
|
+
"bin",
|
|
11
|
+
"src"
|
|
12
|
+
],
|
|
13
|
+
"scripts": {
|
|
14
|
+
"test": "node --test"
|
|
15
|
+
},
|
|
16
|
+
"engines": {
|
|
17
|
+
"node": ">=18"
|
|
18
|
+
},
|
|
19
|
+
"keywords": [
|
|
20
|
+
"drift",
|
|
21
|
+
"sync",
|
|
22
|
+
"cli",
|
|
23
|
+
"devtools",
|
|
24
|
+
"repos",
|
|
25
|
+
"template",
|
|
26
|
+
"diff",
|
|
27
|
+
"config"
|
|
28
|
+
],
|
|
29
|
+
"author": "Ben Malaga",
|
|
30
|
+
"license": "MIT",
|
|
31
|
+
"repository": {
|
|
32
|
+
"type": "git",
|
|
33
|
+
"url": "git+https://github.com/BenMalaga/kindred.git"
|
|
34
|
+
},
|
|
35
|
+
"bugs": {
|
|
36
|
+
"url": "https://github.com/BenMalaga/kindred/issues"
|
|
37
|
+
},
|
|
38
|
+
"homepage": "https://github.com/BenMalaga/kindred#readme"
|
|
39
|
+
}
|
package/src/cli.js
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
// CLI entry: argument parsing and command dispatch.
|
|
2
|
+
|
|
3
|
+
import { readFileSync } from 'node:fs';
|
|
4
|
+
import { resolve } from 'node:path';
|
|
5
|
+
import { fileURLToPath } from 'node:url';
|
|
6
|
+
|
|
7
|
+
import { unifiedDiff } from './diff.js';
|
|
8
|
+
import { renderMatrix, toJSON } from './render.js';
|
|
9
|
+
import { scan } from './scan.js';
|
|
10
|
+
|
|
11
|
+
const HELP = `kindred: template-less drift detection for a folder of sibling repos
|
|
12
|
+
|
|
13
|
+
usage:
|
|
14
|
+
kindred scan [dir] [--json] scan sibling git repos under dir (default: cwd)
|
|
15
|
+
and print the drift matrix
|
|
16
|
+
kindred diff <relpath> [dir] unified diff between the variants of a shared file
|
|
17
|
+
kindred --help show this help
|
|
18
|
+
kindred --version show version
|
|
19
|
+
|
|
20
|
+
examples:
|
|
21
|
+
kindred scan ~/projects
|
|
22
|
+
kindred diff .gitignore ~/projects
|
|
23
|
+
npx kindred-drift scan .
|
|
24
|
+
`;
|
|
25
|
+
|
|
26
|
+
/**
 * Read the package version from the package.json located one directory
 * above this module.
 * @returns {string} the manifest's "version" field
 */
function version() {
  const manifestPath = fileURLToPath(new URL('../package.json', import.meta.url));
  const { version: current } = JSON.parse(readFileSync(manifestPath, 'utf8'));
  return current;
}
|
|
32
|
+
|
|
33
|
+
/**
 * `kindred scan [dir] [--json]`: scan the target directory and print either
 * the JSON form of the result or the plain-text drift matrix.
 * @param {string[]} args arguments after the "scan" command word
 * @returns {number} exit code (always 0; failures surface as exceptions)
 */
function cmdScan(args) {
  const wantsJson = args.includes('--json');
  // Anything that does not look like a long flag is positional; the first
  // positional is the directory to scan.
  const [dir = '.'] = args.filter((arg) => !arg.startsWith('--'));
  const root = resolve(dir);
  const result = scan(root);

  if (!wantsJson) {
    const { repos } = result;
    process.stdout.write(`${repos.length} repos under ${root}:\n`);
    process.stdout.write(` ${repos.join(', ')}\n\n`);
    process.stdout.write(`${renderMatrix(result)}\n`);
    return 0;
  }

  const payload = JSON.stringify(toJSON(result), null, 2);
  process.stdout.write(`${payload}\n`);
  return 0;
}
|
|
47
|
+
|
|
48
|
+
/**
 * `kindred diff <relpath> [dir]`: print a unified diff between the reference
 * variant of a shared file and one representative of every other variant.
 * @param {string[]} args arguments after the "diff" command word
 * @returns {number} exit code: 0 on success, 1 when relpath is not a shared
 *   file, 2 when relpath was not given
 */
function cmdDiff(args) {
  const [relpath, dir = '.'] = args.filter((arg) => !arg.startsWith('--'));
  if (!relpath) {
    process.stderr.write('usage: kindred diff <relpath> [dir]\n');
    return 2;
  }

  const root = resolve(dir);
  const { clusters } = scan(root);
  const cluster = clusters.find((candidate) => candidate.relpath === relpath);
  if (!cluster) {
    process.stderr.write(`kindred: ${relpath} is not a shared file under ${root}\n`);
    process.stderr.write('hint: run "kindred scan" to list shared files\n');
    return 1;
  }

  // A single variant means every repo that has the file holds the same content.
  if (cluster.variantCount === 1) {
    const holders = [];
    for (const [repo, entry] of cluster.statuses.entries()) {
      if (entry.status !== 'missing') holders.push(repo);
    }
    process.stdout.write(
      `${relpath}: all ${holders.length} copies identical (${holders.join(', ')})\n`
    );
    return 0;
  }

  // One diff per distinct non-reference variant: track content hashes already
  // covered, starting with the reference's own hash.
  const { referenceRepo, members } = cluster;
  const reference = members.get(referenceRepo);
  const covered = new Set([reference.hash]);
  const sections = [];
  for (const [repo, entry] of members) {
    if (!covered.has(entry.hash)) {
      covered.add(entry.hash);
      const fromLabel = `${referenceRepo}/${relpath}`;
      const toLabel = `${repo}/${relpath}`;
      sections.push(unifiedDiff(reference.lines, entry.lines, fromLabel, toLabel));
    }
  }
  process.stdout.write(`${sections.join('\n\n')}\n`);
  return 0;
}
|
|
92
|
+
|
|
93
|
+
/**
 * Run the CLI: dispatch on the first argument and translate any thrown error
 * into a one-line message on stderr.
 * @param {string[]} argv process.argv.slice(2)
 * @returns {number} exit code: 0 on success, 1 when a command throws, 2 for
 *   usage problems (no command or an unknown command)
 */
export function main(argv) {
  const [command, ...rest] = argv;
  try {
    if (command === 'scan') return cmdScan(rest);
    if (command === 'diff') return cmdDiff(rest);

    if (command === '--version' || command === '-v') {
      process.stdout.write(`${version()}\n`);
      return 0;
    }

    const askedForHelp = command === '--help' || command === '-h' || command === 'help';
    if (command === undefined || askedForHelp) {
      // Help goes to stdout either way; running with no command at all is
      // still reported as a usage error through the exit code.
      process.stdout.write(HELP);
      return command === undefined ? 2 : 0;
    }

    process.stderr.write(`kindred: unknown command "${command}"\n\n`);
    process.stderr.write(HELP);
    return 2;
  } catch (err) {
    process.stderr.write(`kindred: ${err.message}\n`);
    return 1;
  }
}
|
package/src/cluster.js
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
// Clustering: decide which files are comparable across sibling repos, and
|
|
2
|
+
// compute a per-repo status for each cluster.
|
|
3
|
+
//
|
|
4
|
+
// A cluster is keyed by relative path. A relative path becomes a cluster if:
|
|
5
|
+
// - it exists in 2 or more repos (the implicit shared template), or
|
|
6
|
+
// - it is a well-known shareable file (LICENSE, .gitignore, anything under
|
|
7
|
+
// .github/, etc.) present in at least one repo. Absence of a well-known
|
|
8
|
+
// file is drift too, so those clusters are reported even with one member.
|
|
9
|
+
//
|
|
10
|
+
// Within a cluster, identical normalized content is grouped into variants.
|
|
11
|
+
// The variant held by the most repos is the reference (ties break toward the
|
|
12
|
+
// variant containing the alphabetically first repo). Each repo is then:
|
|
13
|
+
// - identical: holds the reference variant
|
|
14
|
+
// - drifted: holds a different variant (with similarity vs the reference)
|
|
15
|
+
// - missing: does not have the file
|
|
16
|
+
|
|
17
|
+
import { similarity } from './similarity.js';
|
|
18
|
+
|
|
19
|
+
// Basenames treated as well-known shareable files wherever they sit in a repo.
export const WELL_KNOWN_BASENAMES = new Set([
  // licensing and community files
  'LICENSE',
  'LICENSE.md',
  'LICENSE.txt',
  'LICENCE',
  'COPYING',
  'CONTRIBUTING.md',
  'CODE_OF_CONDUCT.md',
  'SECURITY.md',
  // dotfiles
  '.gitignore',
  '.gitattributes',
  '.editorconfig',
  '.npmrc',
  '.nvmrc',
  '.node-version',
  '.dockerignore',
  // build entry points
  'Dockerfile',
  'Makefile',
  // tool configuration
  'tsconfig.json',
  'jsconfig.json',
  'eslint.config.js',
  'eslint.config.mjs',
  'eslint.config.cjs',
  'prettier.config.js',
  'prettier.config.mjs',
  'rollup.config.js',
  'vitest.config.js',
  'vitest.config.ts',
  'jest.config.js',
  'babel.config.js',
  'renovate.json',
  '.releaserc'
]);

// rc-style config files: the bare name or the name plus any dotted extension.
const WELL_KNOWN_PATTERNS = [
  /^\.eslintrc(\..+)?$/u,
  /^\.prettierrc(\..+)?$/u,
  /^\.babelrc(\..+)?$/u,
  /^\.stylelintrc(\..+)?$/u
];

/**
 * Decide whether a relative path names a well-known shareable file.
 *
 * Everything under a top-level `.github/` directory qualifies; otherwise the
 * decision is made on the basename alone, via the exact-name set and then the
 * rc-file patterns.
 * @param {string} relpath posix-style relative path within a repo
 * @returns {boolean}
 */
export function isWellKnown(relpath) {
  if (relpath.startsWith('.github/')) return true;
  // lastIndexOf yields -1 when there is no slash, so the slice starts at 0
  // and the whole path is the basename.
  const base = relpath.slice(relpath.lastIndexOf('/') + 1);
  return (
    WELL_KNOWN_BASENAMES.has(base) ||
    WELL_KNOWN_PATTERNS.some((pattern) => pattern.test(base))
  );
}
|
|
71
|
+
|
|
72
|
+
/**
 * Select the relative paths worth comparing across repos.
 *
 * A path is kept when at least two repos contain it, or when it is a
 * well-known shareable file present in at least one repo.
 * @param {Map<string, Set<string>>} repoPaths repo name -> set of relpaths
 * @returns {string[]} sorted cluster keys
 */
export function clusterPaths(repoPaths) {
  // Tally how many repos contain each relative path.
  const tally = new Map();
  for (const paths of repoPaths.values()) {
    for (const relpath of paths) {
      const seen = tally.get(relpath) ?? 0;
      tally.set(relpath, seen + 1);
    }
  }

  return [...tally]
    .filter(([relpath, seen]) => seen >= 2 || isWellKnown(relpath))
    .map(([relpath]) => relpath)
    .sort();
}
|
|
90
|
+
|
|
91
|
+
/**
 * Compute variant groups and per-repo status for one cluster.
 *
 * Repos that hold the file are grouped by normalized content hash. The group
 * with the most repos is the reference; on a tie, the group whose first repo
 * sorts first wins. Every repo in allRepos then gets one status:
 * 'identical' (holds the reference content), 'drifted' (holds other content,
 * scored against the reference with simFn) or 'missing' (no entry in members).
 *
 * @param {string} relpath the cluster key
 * @param {Map<string, {hash: string, lines: string[]}>} members
 *   repo name -> normalized content for repos that have the file
 * @param {string[]} allRepos every repo in the scan, sorted
 * @param {(a: string[], b: string[]) => number} [simFn] similarity function
 * @returns {{
 *   relpath: string,
 *   variantCount: number,
 *   referenceRepo: string|null,
 *   statuses: Map<string, {status: 'identical'|'drifted'|'missing', similarity: number|null}>
 * }} referenceRepo is null (and variantCount is 0) only when no repo in
 *   allRepos holds the file
 */
export function clusterStatus(relpath, members, allRepos, simFn = similarity) {
  // Group repos by normalized content hash. Each list inherits the order of
  // allRepos, so its first element is the group's alphabetically first repo
  // whenever allRepos is sorted.
  const variants = new Map();
  for (const repo of allRepos) {
    const entry = members.get(repo);
    if (!entry) continue;
    const holders = variants.get(entry.hash);
    if (holders) holders.push(repo);
    else variants.set(entry.hash, [repo]);
  }

  // No repo in allRepos holds the file: there is no reference to compare
  // against, so report every repo as missing instead of dereferencing an
  // undefined member below.
  if (variants.size === 0) {
    return {
      relpath,
      variantCount: 0,
      referenceRepo: null,
      statuses: new Map(
        allRepos.map((repo) => [repo, { status: 'missing', similarity: null }])
      )
    };
  }

  // Reference variant: most repos; ties break toward the variant whose first
  // (alphabetically smallest) repo sorts first.
  let refHash = null;
  let refRepos = [];
  for (const [hash, repos] of variants) {
    if (
      repos.length > refRepos.length ||
      (repos.length === refRepos.length && (refRepos.length === 0 || repos[0] < refRepos[0]))
    ) {
      refHash = hash;
      refRepos = repos;
    }
  }
  const refLines = members.get(refRepos[0]).lines;

  const statuses = new Map();
  for (const repo of allRepos) {
    const entry = members.get(repo);
    if (!entry) {
      statuses.set(repo, { status: 'missing', similarity: null });
    } else if (entry.hash === refHash) {
      statuses.set(repo, { status: 'identical', similarity: 1 });
    } else {
      statuses.set(repo, { status: 'drifted', similarity: simFn(refLines, entry.lines) });
    }
  }

  return {
    relpath,
    variantCount: variants.size,
    referenceRepo: refRepos[0],
    statuses
  };
}
|
package/src/diff.js
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
// Minimal unified diff between two line arrays. Zero dependencies.
|
|
2
|
+
// Classic LCS backtrack for edit script, then hunk assembly with context.
|
|
3
|
+
|
|
4
|
+
// Upper bound on a.length * b.length for which the full LCS table is built.
const DIFF_CELL_LIMIT = 4_000_000;

/**
 * LCS edit script via a full dynamic-programming table and a backtrack.
 * Callers are responsible for keeping a.length * b.length within
 * DIFF_CELL_LIMIT.
 * @param {string[]} a
 * @param {string[]} b
 * @returns {Array<{type: 'eq'|'del'|'add', line: string}>}
 */
function lcsOps(a, b) {
  // Full LCS table (Uint32, one row per a-line) for backtracking.
  const rows = a.length + 1;
  const cols = b.length + 1;
  const table = new Uint32Array(rows * cols);
  for (let i = 1; i < rows; i++) {
    const ai = a[i - 1];
    for (let j = 1; j < cols; j++) {
      table[i * cols + j] =
        ai === b[j - 1]
          ? table[(i - 1) * cols + (j - 1)] + 1
          : Math.max(table[(i - 1) * cols + j], table[i * cols + (j - 1)]);
    }
  }

  // Walk back from the bottom-right corner. Ops are collected in reverse; on
  // a tie the addition is pushed first so that, once reversed, deletions
  // precede additions within a changed run.
  const ops = [];
  let i = a.length;
  let j = b.length;
  while (i > 0 && j > 0) {
    if (a[i - 1] === b[j - 1]) {
      ops.push({ type: 'eq', line: a[i - 1] });
      i--;
      j--;
    } else if (table[(i - 1) * cols + j] > table[i * cols + (j - 1)]) {
      ops.push({ type: 'del', line: a[i - 1] });
      i--;
    } else {
      ops.push({ type: 'add', line: b[j - 1] });
      j--;
    }
  }
  while (i > 0) ops.push({ type: 'del', line: a[--i] });
  while (j > 0) ops.push({ type: 'add', line: b[--j] });
  return ops.reverse();
}

/**
 * Edit script between two line arrays.
 *
 * Inputs whose table fits within DIFF_CELL_LIMIT go straight through the LCS
 * backtrack. Larger inputs first have their common leading and trailing lines
 * peeled off as 'eq' ops; the remaining middle is diffed with the LCS when it
 * fits, and only if it is still too large is it reported as a block replace
 * (all deletions, then all additions). This keeps a small edit inside a very
 * large file from being reported as a whole-file rewrite.
 * @param {string[]} a
 * @param {string[]} b
 * @returns {Array<{type: 'eq'|'del'|'add', line: string}>}
 */
export function diffOps(a, b) {
  if (a.length * b.length <= DIFF_CELL_LIMIT) return lcsOps(a, b);

  // Oversized: measure the common prefix, then the common suffix of what is
  // left (the suffix may not overlap the prefix).
  const shared = Math.min(a.length, b.length);
  let head = 0;
  while (head < shared && a[head] === b[head]) head++;
  let tail = 0;
  while (
    tail < shared - head &&
    a[a.length - 1 - tail] === b[b.length - 1 - tail]
  ) {
    tail++;
  }

  const midA = a.slice(head, a.length - tail);
  const midB = b.slice(head, b.length - tail);
  const middle =
    midA.length * midB.length <= DIFF_CELL_LIMIT
      ? lcsOps(midA, midB)
      : [
          ...midA.map((line) => ({ type: 'del', line })),
          ...midB.map((line) => ({ type: 'add', line }))
        ];

  const asEq = (line) => ({ type: 'eq', line });
  return [
    ...a.slice(0, head).map(asEq),
    ...middle,
    ...a.slice(a.length - tail).map(asEq)
  ];
}
|
|
55
|
+
|
|
56
|
+
/**
 * Render a unified diff (`---`/`+++` header followed by `@@` hunks).
 * @param {string[]} a old lines
 * @param {string[]} b new lines
 * @param {string} aLabel label printed after `---`
 * @param {string} bLabel label printed after `+++`
 * @param {number} [context] equal lines shown around each change
 * @returns {string} empty string when the edit script contains no changes
 */
export function unifiedDiff(a, b, aLabel, bLabel, context = 3) {
  const ops = diffOps(a, b);
  if (!ops.some((op) => op.type !== 'eq')) return '';

  // Tag every op with the 1-based line number it has (or would have) on each
  // side: additions do not consume an a-line, deletions do not consume a b-line.
  const numbered = [];
  let nextA = 1;
  let nextB = 1;
  for (const op of ops) {
    numbered.push({ ...op, aLine: nextA, bLine: nextB });
    if (op.type !== 'add') nextA += 1;
    if (op.type !== 'del') nextB += 1;
  }

  // Collect [firstChanged, lastChanged] op-index spans. A change starts a new
  // span when it lies more than 2 * context op positions past the previous
  // change; otherwise it extends the current span.
  const spans = [];
  numbered.forEach((op, idx) => {
    if (op.type === 'eq') return;
    const current = spans[spans.length - 1];
    if (current && !(idx - current[1] > context * 2)) current[1] = idx;
    else spans.push([idx, idx]);
  });

  const out = [`--- ${aLabel}`, `+++ ${bLabel}`];
  for (const [firstChanged, lastChanged] of spans) {
    const from = Math.max(0, firstChanged - context);
    const to = Math.min(numbered.length - 1, lastChanged + context);
    const hunk = numbered.slice(from, to + 1);

    let aCount = 0;
    let bCount = 0;
    for (const op of hunk) {
      if (op.type !== 'add') aCount += 1;
      if (op.type !== 'del') bCount += 1;
    }
    // A side that contributes no lines to the hunk is reported at the line
    // before the hunk's position rather than at it.
    const aStart = aCount === 0 ? hunk[0].aLine - 1 : hunk[0].aLine;
    const bStart = bCount === 0 ? hunk[0].bLine - 1 : hunk[0].bLine;
    out.push(`@@ -${aStart},${aCount} +${bStart},${bCount} @@`);

    for (const op of hunk) {
      let marker = '+';
      if (op.type === 'eq') marker = ' ';
      else if (op.type === 'del') marker = '-';
      out.push(marker + op.line);
    }
  }
  return out.join('\n');
}
|
package/src/normalize.js
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
// Content normalization: makes cosmetic differences invisible to comparison.
|
|
2
|
+
// Two files that differ only in trailing whitespace, CRLF vs LF, or the
|
|
3
|
+
// number of consecutive blank lines are treated as identical.
|
|
4
|
+
|
|
5
|
+
/**
 * Turn raw file content into normalized lines so that cosmetic differences
 * do not register as drift.
 * - CRLF and bare CR line endings are treated as LF.
 * - Trailing spaces and tabs are removed from every line.
 * - A run of blank lines is reduced to a single blank line.
 * - Blank lines at the very start and very end are dropped.
 * @param {string} text raw file content
 * @returns {string[]} normalized lines
 */
export function normalizeLines(text) {
  const stripped = String(text)
    .replace(/\r\n/g, '\n')
    .replace(/\r/g, '\n')
    .split('\n')
    .map((line) => line.replace(/[ \t]+$/u, ''));

  // Keep a blank line only when the line before it was not blank.
  const collapsed = stripped.filter(
    (line, idx) => line !== '' || idx === 0 || stripped[idx - 1] !== ''
  );

  // Narrow the window past any blank line left at either end.
  let start = 0;
  let end = collapsed.length;
  while (start < end && collapsed[start] === '') start += 1;
  while (end > start && collapsed[end - 1] === '') end -= 1;
  return collapsed.slice(start, end);
}
|
|
36
|
+
|
|
37
|
+
/**
 * Normalize file content and return it as one LF-joined string.
 * @param {string} text raw file content
 * @returns {string} the normalized lines joined by "\n"
 */
export function normalizeText(text) {
  const lines = normalizeLines(text);
  return lines.join('\n');
}
|
package/src/render.js
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
// Text rendering of the drift matrix.
|
|
2
|
+
|
|
3
|
+
/**
 * Cell text for one repo status.
 *
 * Identical cells render as `=`, missing cells as `--`, and anything else as
 * `~NN%`. The percentage is capped at 99: a drifted variant whose similarity
 * rounds up to 100 would otherwise print `~100%`, which reads as "no drift"
 * even though the content differs from the reference.
 * @param {{status: string, similarity: number|null}} entry
 * @returns {string}
 */
function cellText(entry) {
  if (entry.status === 'identical') return '=';
  if (entry.status === 'missing') return '--';
  const percent = Math.min(99, Math.round(entry.similarity * 100));
  return `~${percent}%`;
}
|
|
13
|
+
|
|
14
|
+
/**
 * Render the drift matrix as a plain-text table: a header row, a dashed rule,
 * one row per cluster, then a legend and a one-line summary.
 * @param {{root: string, repos: string[], clusters: Array}} result
 * @returns {string}
 */
export function renderMatrix(result) {
  const { repos, clusters } = result;
  if (clusters.length === 0) {
    return `scanned ${repos.length} repos under ${result.root}: no shared files found`;
  }

  // Render every cell once; grid[row][col] follows clusters x repos.
  const grid = clusters.map((cluster) =>
    repos.map((repo) => cellText(cluster.statuses.get(repo)))
  );

  // The path column is at least as wide as the "file" heading; each repo
  // column fits the repo name and its widest cell.
  const pathWidth = Math.max(4, ...clusters.map((cluster) => cluster.relpath.length));
  const colWidths = repos.map((repo, col) =>
    Math.max(repo.length, ...grid.map((cells) => cells[col].length))
  );

  const formatRow = (label, cells) =>
    `${label.padEnd(pathWidth)} ${cells.map((cell, col) => cell.padEnd(colWidths[col])).join(' ')}`;

  const lines = [
    formatRow('file', repos),
    `${'-'.repeat(pathWidth)} ${colWidths.map((width) => '-'.repeat(width)).join(' ')}`,
    ...clusters.map((cluster, row) => formatRow(cluster.relpath, grid[row]))
  ];

  // Summary counts. A cluster with more than one variant counts as drifted;
  // a single-variant cluster counts as identical only when no repo lacks the
  // file. Gaps are tallied independently of both.
  let identical = 0;
  let drifted = 0;
  let gaps = 0;
  for (const { statuses, variantCount } of clusters) {
    const hasGap = [...statuses.values()].some((entry) => entry.status === 'missing');
    if (hasGap) gaps += 1;
    if (variantCount > 1) drifted += 1;
    else if (!hasGap) identical += 1;
  }

  const plural = clusters.length === 1 ? '' : 's';
  lines.push('');
  lines.push('legend: = identical to reference ~NN% drifted (similarity) -- missing');
  lines.push(
    `${clusters.length} shared file${plural} across ${repos.length} repos: ` +
      `${identical} identical everywhere, ${drifted} drifted, ${gaps} missing somewhere`
  );
  return lines.join('\n');
}
|
|
65
|
+
|
|
66
|
+
/**
 * Convert a scan result into a plain, JSON-serializable object. Status maps
 * become plain objects keyed by repo name, and similarity scores are rounded
 * to three decimal places (null is passed through).
 * @param {{root: string, repos: string[], clusters: Array}} result
 * @returns {object}
 */
export function toJSON(result) {
  const roundScore = (score) => (score === null ? null : Math.round(score * 1000) / 1000);

  const clusters = result.clusters.map((cluster) => {
    const perRepo = Array.from(cluster.statuses.entries(), ([repo, entry]) => [
      repo,
      { status: entry.status, similarity: roundScore(entry.similarity) }
    ]);
    return {
      path: cluster.relpath,
      variants: cluster.variantCount,
      reference: cluster.referenceRepo,
      repos: Object.fromEntries(perRepo)
    };
  });

  return {
    root: result.root,
    repos: result.repos,
    clusters
  };
}
|
package/src/scan.js
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
// Filesystem walking and scan orchestration. Strictly read-only: this module
|
|
2
|
+
// only ever opens files for reading and never writes inside scanned repos.
|
|
3
|
+
|
|
4
|
+
import { createHash } from 'node:crypto';
|
|
5
|
+
import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs';
|
|
6
|
+
import { join } from 'node:path';
|
|
7
|
+
|
|
8
|
+
import { clusterPaths, clusterStatus } from './cluster.js';
|
|
9
|
+
import { normalizeLines } from './normalize.js';
|
|
10
|
+
|
|
11
|
+
// Directory names that are never descended into while walking a repo:
// VCS metadata, dependency trees, build output, tool caches, editor state.
const SKIP_DIRS = new Set([
  '.git', 'node_modules',
  'dist', 'build', 'out', 'coverage', 'vendor', 'target',
  '.next', '.nuxt', '.turbo', '.cache',
  '.venv', 'venv', '__pycache__', '.pytest_cache',
  '.idea'
]);

// File names that are never collected. Besides OS clutter this covers
// lockfiles: they are generated by tooling and are expected to differ from
// repo to repo, so comparing them would only add noise to the drift report.
const SKIP_FILES = new Set([
  '.DS_Store', 'Thumbs.db',
  'package-lock.json', 'npm-shrinkwrap.json', 'yarn.lock', 'pnpm-lock.yaml',
  'bun.lockb', 'bun.lock',
  'Cargo.lock', 'poetry.lock', 'uv.lock', 'Gemfile.lock', 'composer.lock',
  'go.sum'
]);

// Files larger than this many bytes (1 MiB) are left out of the scan.
const MAX_FILE_BYTES = 1024 * 1024;
// Deepest directory level (repo root is level 0) the walker will enter.
const MAX_DEPTH = 10;
|
|
52
|
+
|
|
53
|
+
/**
 * Find the git repositories that sit directly under `root`.
 *
 * A child counts as a repo when it is a directory, its name does not start
 * with a dot, and it contains an entry named `.git`.
 *
 * @param {string} root
 * @returns {Array<{name: string, path: string}>} sorted by name
 */
export function listRepos(root) {
  const isVisibleDir = (dirent) => dirent.isDirectory() && !dirent.name.startsWith('.');

  const found = readdirSync(root, { withFileTypes: true })
    .filter(isVisibleDir)
    .map(({ name }) => ({ name, path: join(root, name) }))
    .filter((repo) => existsSync(join(repo.path, '.git')));

  // Plain code-unit ordering on the directory name.
  found.sort((left, right) => (left.name < right.name ? -1 : 1));
  return found;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Cheap binary detection: report true when a NUL byte occurs anywhere in the
 * first 8 KiB of the buffer.
 *
 * @param {Buffer} buf
 * @returns {boolean}
 */
function looksBinary(buf) {
  const sniffEnd = Math.min(buf.length, 8192);
  for (let offset = 0; offset < sniffEnd; offset++) {
    if (buf[offset] === 0) return true;
  }
  return false;
}
|
|
76
|
+
|
|
77
|
+
/**
 * Walk a repo and gather the relative paths of files eligible for comparison.
 *
 * Directories named in SKIP_DIRS are not entered, files named in SKIP_FILES
 * or larger than MAX_FILE_BYTES are left out, and the walk stops below
 * MAX_DEPTH levels. Directories that cannot be listed and files that cannot
 * be stat'ed are skipped silently. Only entries that report as regular files
 * are collected; file contents are not inspected here.
 *
 * @param {string} repoPath
 * @returns {Set<string>} posix-style relative paths
 */
export function collectFiles(repoPath) {
  const relpaths = new Set();

  // True when the file can be stat'ed and fits within the size cap.
  const fitsSizeCap = (absPath) => {
    try {
      return statSync(absPath).size <= MAX_FILE_BYTES;
    } catch {
      return false;
    }
  };

  function visit(absDir, relDir, level) {
    if (level > MAX_DEPTH) return;

    let dirents;
    try {
      dirents = readdirSync(absDir, { withFileTypes: true });
    } catch {
      // Unreadable directory: best effort, carry on with the rest of the repo.
      return;
    }

    for (const dirent of dirents) {
      const childName = dirent.name;
      // Relative paths always use '/', regardless of the platform separator.
      const childRel = relDir ? `${relDir}/${childName}` : childName;

      if (dirent.isDirectory()) {
        if (!SKIP_DIRS.has(childName)) visit(join(absDir, childName), childRel, level + 1);
        continue;
      }
      if (!dirent.isFile() || SKIP_FILES.has(childName)) continue;
      if (fitsSizeCap(join(absDir, childName))) relpaths.add(childRel);
    }
  }

  visit(repoPath, '', 0);
  return relpaths;
}
|
|
111
|
+
|
|
112
|
+
/**
 * Scan a folder of sibling repos and build one drift cluster per shared path.
 *
 * Steps: list the repos under `root`, collect candidate files per repo, ask
 * clusterPaths() which relative paths to compare, then read, normalize and
 * hash each repo's copy of those paths. Copies that cannot be read or that
 * look binary are left out; a path with no usable copy yields no cluster.
 *
 * @param {string} root
 * @returns {{
 *   root: string,
 *   repos: string[],
 *   clusters: Array<ReturnType<typeof clusterStatus> & {members: Map<string, {hash: string, lines: string[]}>}>
 * }}
 * @throws {Error} when fewer than two git repos are found directly under root
 */
export function scan(root) {
  const repos = listRepos(root);
  if (repos.length < 2) {
    throw new Error(
      `need at least 2 git repos directly under ${root}, found ${repos.length}`
    );
  }

  const repoNames = [];
  const repoDirs = new Map();
  const repoFiles = new Map();
  for (const { name, path } of repos) {
    repoNames.push(name);
    repoDirs.set(name, path);
    repoFiles.set(name, collectFiles(path));
  }

  // Load one repo's copy of relpath, or null when it is absent/unusable.
  const loadMember = (repo, relpath) => {
    if (!repoFiles.get(repo).has(relpath)) return null;
    let raw;
    try {
      raw = readFileSync(join(repoDirs.get(repo), ...relpath.split('/')));
    } catch {
      return null;
    }
    if (looksBinary(raw)) return null;
    const lines = normalizeLines(raw.toString('utf8'));
    // Hash the normalized text so copies that normalize identically match.
    const hash = createHash('sha256').update(lines.join('\n')).digest('hex');
    return { hash, lines };
  };

  const clusters = [];
  // Content is only read for paths that clusterPaths() selected.
  for (const relpath of clusterPaths(repoFiles)) {
    const members = new Map();
    for (const repo of repoNames) {
      const member = loadMember(repo, relpath);
      if (member !== null) members.set(repo, member);
    }
    if (members.size > 0) {
      clusters.push({ ...clusterStatus(relpath, members, repoNames), members });
    }
  }

  return { root, repos: repoNames, clusters };
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
// Line-based similarity between two normalized files.
|
|
2
|
+
//
|
|
3
|
+
// Primary metric: normalized diff ratio, 2 * LCS(a, b) / (|a| + |b|).
|
|
4
|
+
// This is the same family as difflib's ratio: 1.0 means identical line
|
|
5
|
+
// sequences, 0.0 means no line in common. It is order-sensitive, which is
|
|
6
|
+
// what you want for config files where reordering is a real change.
|
|
7
|
+
//
|
|
8
|
+
// For very large pairs (where the O(n*m) LCS table would be expensive) we
|
|
9
|
+
// fall back to a multiset Sorensen-Dice ratio, which is order-insensitive
|
|
10
|
+
// but linear time. Both return values in [0, 1].
|
|
11
|
+
|
|
12
|
+
// Largest |a| * |b| product (in lines) for which similarity() still uses the
// quadratic LCS ratio; pairs above it go to the linear multiset ratio.
const LCS_CELL_LIMIT = 4_000_000; // ~2000 x 2000 lines
|
|
13
|
+
|
|
14
|
+
/**
 * Intern lines into integers so comparisons are cheap.
 * Both arrays share one id table, so equal lines get equal ids.
 * @param {string[]} a
 * @param {string[]} b
 * @returns {{a: Int32Array, b: Int32Array}}
 */
function intern(a, b) {
  const table = new Map();
  const enc = (line) => {
    let id = table.get(line);
    if (id === undefined) {
      // Next unused id is simply the number of distinct lines seen so far.
      id = table.size;
      table.set(line, id);
    }
    return id;
  };
  return {
    a: Int32Array.from(a, enc),
    b: Int32Array.from(b, enc)
  };
}

/**
 * Length of the longest common subsequence of two line arrays.
 *
 * Dynamic programming with two rolling rows: O(n*m) time. The rows are sized
 * by the shorter input, so the DP table itself needs only O(min(n, m)) space
 * (the interned copies of both inputs are O(n + m)).
 *
 * @param {string[]} aLines
 * @param {string[]} bLines
 * @returns {number}
 */
export function lcsLength(aLines, bLines) {
  if (aLines.length === 0 || bLines.length === 0) return 0;
  const { a, b } = intern(aLines, bLines);

  // LCS is symmetric in its arguments, so iterate the longer sequence on the
  // outside and keep the rolling rows as short as the shorter sequence.
  const [outer, inner] = a.length >= b.length ? [a, b] : [b, a];
  const width = inner.length;

  let prev = new Int32Array(width + 1);
  let curr = new Int32Array(width + 1);
  for (let i = 0; i < outer.length; i++) {
    const oi = outer[i];
    // Column 0 stays 0 in both rows: LCS against an empty prefix.
    for (let j = 1; j <= width; j++) {
      curr[j] = oi === inner[j - 1] ? prev[j - 1] + 1 : Math.max(prev[j], curr[j - 1]);
    }
    // Reuse the two buffers instead of allocating a row per iteration.
    [prev, curr] = [curr, prev];
  }
  return prev[width];
}
|
|
57
|
+
|
|
58
|
+
/**
 * Overlap ratio on line multisets: 2 * sum(min(countA, countB)) / (|a| + |b|).
 *
 * Line order is ignored and each line of `aLines` can be matched by at most
 * one line of `bLines`. Runs in time linear in the total number of lines and
 * serves as the fallback metric for huge files.
 *
 * @param {string[]} aLines
 * @param {string[]} bLines
 * @returns {number} in [0, 1]
 */
export function multisetRatio(aLines, bLines) {
  const total = aLines.length + bLines.length;
  if (total === 0) return 1; // two empty files are identical
  if (aLines.length === 0 || bLines.length === 0) return 0;

  // How many unmatched copies of each line from `aLines` are still available.
  const available = new Map();
  for (const line of aLines) {
    const seen = available.get(line);
    available.set(line, seen === undefined ? 1 : seen + 1);
  }

  let matched = 0;
  for (const line of bLines) {
    const left = available.get(line);
    if (left === undefined || left <= 0) continue;
    available.set(line, left - 1);
    matched += 1;
  }

  return (2 * matched) / total;
}
|
|
80
|
+
|
|
81
|
+
/**
 * Score how alike two normalized line arrays are, on a scale of 0 to 1.
 *
 * Two empty inputs score 1; one empty input scores 0. Otherwise the score is
 * the LCS diff ratio 2 * LCS / (|a| + |b|), unless |a| * |b| exceeds
 * LCS_CELL_LIMIT, in which case the multiset ratio is used instead.
 *
 * @param {string[]} aLines
 * @param {string[]} bLines
 * @returns {number}
 */
export function similarity(aLines, bLines) {
  const aCount = aLines.length;
  const bCount = bLines.length;

  if (aCount === 0 || bCount === 0) {
    return aCount === bCount ? 1 : 0;
  }

  const tooLargeForLcs = aCount * bCount > LCS_CELL_LIMIT;
  if (tooLargeForLcs) {
    return multisetRatio(aLines, bLines);
  }

  const shared = lcsLength(aLines, bLines);
  return (2 * shared) / (aCount + bCount);
}
|