aslopcleaner 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +154 -0
- package/dist/cli.mjs +118 -0
- package/dist/index.mjs +5 -0
- package/dist/scanner-ChvwTQMG.mjs +335 -0
- package/package.json +58 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2019 - 2026 Aron Homberg
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# aslopcleaner
|
|
2
|
+
|
|
3
|
+
High-performance CLI to normalize common LLM/AI Unicode punctuation and symbols into plain ASCII.
|
|
4
|
+
|
|
5
|
+
## What it does
|
|
6
|
+
|
|
7
|
+
- Recursively scans the current directory.
|
|
8
|
+
- Uses `fast-glob` to skip expensive third-party/build/cache directories early.
|
|
9
|
+
- Never opens `.env*`, SSH keys, certificate/key material, or password database files.
|
|
10
|
+
- Skips files larger than **125 KiB**.
|
|
11
|
+
- Uses a jump-sampled binary heuristic before reading as UTF-8.
|
|
12
|
+
- Prompts once per file in interactive mode.
|
|
13
|
+
- Replaces everything automatically in non-interactive mode with `-y`.
|
|
14
|
+
|
|
15
|
+
## Default replacements
|
|
16
|
+
|
|
17
|
+
### Dashes / bullets / arrows
|
|
18
|
+
|
|
19
|
+
- `—` => `-`
|
|
20
|
+
- `–` => `-`
|
|
21
|
+
- `‒` => `-`
|
|
22
|
+
- `―` => `--`
|
|
23
|
+
- `‐` => `-`
|
|
24
|
+
- `‑` => `-`
|
|
25
|
+
- `−` => `-`
|
|
26
|
+
- `→` => `=>`
|
|
27
|
+
- `⇒` => `=>`
|
|
28
|
+
- `⟶` => `=>`
|
|
29
|
+
- `➜` => `=>`
|
|
30
|
+
- `➔` => `=>`
|
|
31
|
+
- `➝` => `=>`
|
|
32
|
+
- `✔` => `-`
|
|
33
|
+
- `✅` => `-`
|
|
34
|
+
- `☑` => `-`
|
|
35
|
+
- `✓` => `-`
|
|
36
|
+
- `•` => `-`
|
|
37
|
+
- `‣` => `-`
|
|
38
|
+
- `◦` => `-`
|
|
39
|
+
- `▪` => `-`
|
|
40
|
+
- `·` => `-`
|
|
41
|
+
- `●` => `-`
|
|
42
|
+
- `○` => `-`
|
|
43
|
+
|
|
44
|
+
### Quotes / punctuation / spacing
|
|
45
|
+
|
|
46
|
+
- `“` => `"`
|
|
47
|
+
- `”` => `"`
|
|
48
|
+
- `„` => `"`
|
|
49
|
+
- `‟` => `"`
|
|
50
|
+
- `«` => `"`
|
|
51
|
+
- `»` => `"`
|
|
52
|
+
- `‹` => `'`
|
|
53
|
+
- `›` => `'`
|
|
54
|
+
- `‘` => `'`
|
|
55
|
+
- `’` => `'`
|
|
56
|
+
- `‚` => `'`
|
|
57
|
+
- `‛` => `'`
|
|
58
|
+
- `…` => `...`
|
|
59
|
+
- `≤` => `<=`
|
|
60
|
+
- `≥` => `>=`
|
|
61
|
+
- `≠` => `!=`
|
|
62
|
+
- NBSP / narrow NBSP / figure space => regular space
|
|
63
|
+
- zero-width space / joiner / BOM => removed
|
|
64
|
+
|
|
65
|
+
## Run it
|
|
66
|
+
|
|
67
|
+
### NPM
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
npx aslopcleaner
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Bun
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
bunx aslopcleaner
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### PNPM
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
pnpx aslopcleaner
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Yarn
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
yarn dlx aslopcleaner
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Local testing
|
|
92
|
+
|
|
93
|
+
#### Node
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
node dist/cli.mjs
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
#### Bun
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
bun run dist/cli.mjs
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Library usage
|
|
106
|
+
|
|
107
|
+
You can also import `aslopcleaner` as a library to integrate Unicode normalization into your own tools:
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
bun add aslopcleaner
|
|
111
|
+
pnpm install aslopcleaner
|
|
112
|
+
yarn add aslopcleaner
|
|
113
|
+
npm install aslopcleaner
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
```ts
|
|
117
|
+
import {
|
|
118
|
+
findOccurrences,
|
|
119
|
+
applyOccurrences,
|
|
120
|
+
countByMatch,
|
|
121
|
+
scanDirectory,
|
|
122
|
+
isProbablyBinary,
|
|
123
|
+
shouldSkipSensitivePath,
|
|
124
|
+
REPLACEMENT_RULES,
|
|
125
|
+
REPLACEMENT_RULE_MAP,
|
|
126
|
+
} from "aslopcleaner";
|
|
127
|
+
|
|
128
|
+
// Scan a directory for files containing Unicode slop
|
|
129
|
+
const { matchesByFile, scannedFiles } = await scanDirectory(process.cwd());
|
|
130
|
+
|
|
131
|
+
// Find occurrences in a string
|
|
132
|
+
const content = '"Hello" → world…';
|
|
133
|
+
const matches = findOccurrences(content, REPLACEMENT_RULES);
|
|
134
|
+
|
|
135
|
+
// Apply replacements
|
|
136
|
+
const cleaned = applyOccurrences(content, matches, REPLACEMENT_RULE_MAP);
|
|
137
|
+
// => '"Hello" => world...'
|
|
138
|
+
|
|
139
|
+
// Count occurrences per symbol
|
|
140
|
+
const counts = countByMatch(matches);
|
|
141
|
+
// => Map { '→' => 1, '…' => 1 }
|
|
142
|
+
|
|
143
|
+
// Check if a file is binary (skip before processing)
|
|
144
|
+
const binary = await isProbablyBinary("image.png"); // true
|
|
145
|
+
|
|
146
|
+
// Check if a path is sensitive (e.g. .env, SSH keys)
|
|
147
|
+
shouldSkipSensitivePath(".env.production"); // true
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### Exported types
|
|
151
|
+
|
|
152
|
+
```ts
|
|
153
|
+
import type { ReplacementRule, MatchOccurrence, ScanResult } from "aslopcleaner";
|
|
154
|
+
```
|
package/dist/cli.mjs
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import process from 'node:process';
|
|
4
|
+
import { readFile, writeFile } from 'node:fs/promises';
|
|
5
|
+
import * as readline from 'node:readline/promises';
|
|
6
|
+
import { R as REPLACEMENT_RULES, s as scanDirectory, d as shouldSkipSensitivePath, b as applyOccurrences, c as countByMatch, a as REPLACEMENT_RULE_MAP } from './scanner-ChvwTQMG.mjs';
|
|
7
|
+
import 'fast-glob';
|
|
8
|
+
|
|
9
|
+
/**
 * Detect which supported switches appear in the raw argument list.
 *
 * Only the presence of a flag matters; arguments that are not one of the
 * known switches are ignored.
 *
 * @param {string[]} argv - Command-line arguments (without node/script path).
 * @returns {{ yes: boolean, help: boolean }} Presence of each switch.
 */
function parseArgs(argv) {
  const isPresent = (...aliases) => aliases.some((alias) => argv.includes(alias));
  return {
    yes: isPresent("-y", "--yes"),
    help: isPresent("-h", "--help"),
  };
}
|
|
16
|
+
/**
 * Print the CLI usage summary to standard output.
 *
 * The text is emitted with a single console.log call and ends with an empty
 * line (the message itself finishes with a newline).
 */
function printHelp() {
  const helpLines = [
    "aslopcleaner",
    "",
    "Usage:",
    "aslopcleaner [-y]",
    "",
    "Options:",
    "-y, --yes Replace every detected occurrence without prompting",
    "-h, --help Show help",
    "",
  ];
  console.log(helpLines.join("\n"));
}
|
|
27
|
+
/**
 * Build a one-line summary of the symbols found in a file.
 *
 * Symbols are listed from most to least frequent (ties ordered with
 * localeCompare). Each entry shows the JSON-quoted symbol, its JSON-quoted
 * replacement ("?" when the rule map has no entry) and the occurrence count.
 *
 * @param {Array<{ index: number, match: string }>} matches - Occurrences in one file.
 * @returns {string} Comma-separated summary, empty when there are no matches.
 */
function formatCounts(matches) {
  const byFrequencyThenSymbol = ([symbolA, countA], [symbolB, countB]) =>
    countB - countA || symbolA.localeCompare(symbolB);
  const describe = ([symbol, count]) => {
    const replacement = REPLACEMENT_RULE_MAP.get(symbol) ?? "?";
    return `${JSON.stringify(symbol)}=>${JSON.stringify(replacement)} x${count}`;
  };
  return Array.from(countByMatch(matches))
    .sort(byFrequencyThenSymbol)
    .map(describe)
    .join(", ");
}
|
|
35
|
+
/**
 * Ask the user whether a file should be rewritten.
 *
 * Prints the file path and a summary of its occurrences, then repeats the
 * question until the trimmed, lower-cased answer is exactly "y" or "n".
 *
 * @param {{ question(prompt: string): Promise<string> }} rl - Prompt interface.
 * @param {string} filePath - Path shown to the user.
 * @param {Array<{ index: number, match: string }>} matches - Occurrences found in the file.
 * @returns {Promise<boolean>} true for "y", false for "n".
 */
async function promptReplace(rl, filePath, matches) {
  console.log(`\n${filePath}`);
  console.log(` ${matches.length} occurrence(s): ${formatCounts(matches)}`);
  const verdicts = new Map([
    ["y", true],
    ["n", false],
  ]);
  for (;;) {
    const reply = await rl.question(" Replace and overwrite? [y/n] ");
    const verdict = verdicts.get(reply.trim().toLowerCase());
    if (verdict !== undefined) {
      return verdict;
    }
    // Anything else: ask again.
  }
}
|
|
49
|
+
/**
 * Apply the recorded occurrences to one file and write it back if it changed.
 *
 * The sensitive-path check is repeated here, so such a file is never read or
 * written by this function regardless of what the caller passes in.
 *
 * @param {string} root - Directory that relativePath is resolved against.
 * @param {string} relativePath - File path relative to root.
 * @param {Array<{ index: number, match: string }>} matches - Occurrences to replace.
 * @returns {Promise<boolean>} true when the file was rewritten, false when it
 *   was skipped or the content was already unchanged.
 */
async function replaceFile(root, relativePath, matches) {
  if (shouldSkipSensitivePath(relativePath)) {
    return false;
  }
  const target = path.join(root, relativePath);
  const before = await readFile(target, "utf8");
  const after = applyOccurrences(before, matches, REPLACEMENT_RULE_MAP);
  const hasChanged = after !== before;
  if (hasChanged) {
    await writeFile(target, after, "utf8");
  }
  return hasChanged;
}
|
|
62
|
+
/**
 * CLI entry point: scan the current working directory, optionally confirm
 * each file with the user, rewrite the approved files and print a summary.
 *
 * Without -y/--yes both stdin and stdout must be TTYs; otherwise an error is
 * printed and the process exit code is set to 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { yes: autoConfirm, help: wantsHelp } = parseArgs(process.argv.slice(2));
  if (wantsHelp) {
    printHelp();
    return;
  }

  const hasTty = process.stdin.isTTY && process.stdout.isTTY;
  if (!autoConfirm && !hasTty) {
    console.error("Interactive mode requires a TTY. Use -y for non-interactive replacement.");
    process.exitCode = 1;
    return;
  }

  const cwd = process.cwd();
  console.log(`Scanning ${cwd}`);
  console.log(`Loaded ${REPLACEMENT_RULES.length} ASCII normalization rule(s).`);

  const scan = await scanDirectory(cwd);
  // Process files in a stable, path-sorted order.
  const entries = Array.from(scan.matchesByFile).sort(([left], [right]) => left.localeCompare(right));
  if (entries.length === 0) {
    console.log("No matching Unicode characters found.");
    return;
  }

  // The prompt interface is only created when the user has to confirm files.
  const rl = autoConfirm
    ? null
    : readline.createInterface({ input: process.stdin, output: process.stdout });

  let updatedFiles = 0;
  let replacedOccurrences = 0;
  try {
    for (const [filePath, matches] of entries) {
      const approved = autoConfirm || (rl !== null && (await promptReplace(rl, filePath, matches)));
      if (approved && (await replaceFile(cwd, filePath, matches))) {
        updatedFiles += 1;
        replacedOccurrences += matches.length;
        console.log(` updated ${filePath}`);
      }
    }
  } finally {
    await rl?.close();
  }

  const summary = [
    "\nDone.",
    ` files with matches: ${entries.length}`,
    ` files updated: ${updatedFiles}`,
    ` occurrences replaced: ${replacedOccurrences}`,
    ` files scanned after glob filtering: ${scan.scannedFiles}`,
    ` skipped by sensitive path rules: ${scan.skippedBySensitivePattern}`,
    ` skipped by size (>125 KiB): ${scan.skippedBySize}`,
    ` skipped as binary: ${scan.skippedByBinary}`,
  ];
  for (const line of summary) {
    console.log(line);
  }
}
|
|
114
|
+
/**
 * Last-resort handler for a rejected main(): print the stack (or message, or
 * the stringified value for non-Error rejections) and mark the run as failed.
 *
 * @param {unknown} error - Rejection reason from main().
 */
function reportFatalError(error) {
  const report = error instanceof Error ? (error.stack ?? error.message) : String(error);
  console.error(report);
  process.exitCode = 1;
}

main().catch(reportFatalError);
|
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
export { F as FAST_GLOB_IGNORE_PATTERNS, M as MAX_FILE_SIZE_BYTES, R as REPLACEMENT_RULES, a as REPLACEMENT_RULE_MAP, b as applyOccurrences, c as countByMatch, f as findOccurrences, i as isProbablyBinary, n as normalizeGlobPath, s as scanDirectory, d as shouldSkipSensitivePath } from './scanner-ChvwTQMG.mjs';
|
|
3
|
+
import 'fast-glob';
|
|
4
|
+
import 'node:path';
|
|
5
|
+
import 'node:fs/promises';
|
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
import fg from 'fast-glob';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { open, stat, readFile } from 'node:fs/promises';
|
|
4
|
+
|
|
5
|
+
/**
 * Directory names that are excluded from scanning wherever they appear in the
 * tree. The order is significant only in that FAST_GLOB_IGNORE_PATTERNS is
 * derived from it.
 */
const SKIPPED_DIRECTORIES = [
  // Version control and editor metadata
  ".git",
  ".hg",
  ".svn",
  ".idea",
  ".vscode",
  // Third-party packages and package-manager stores
  "node_modules",
  "bower_components",
  "vendor",
  ".pnpm-store",
  ".yarn",
  // Tool caches
  ".turbo",
  ".cache",
  ".parcel-cache",
  ".vite",
  ".rollup.cache",
  ".rspack-cache",
  ".eslintcache",
  ".stylelintcache",
  // Framework output
  ".next",
  ".nuxt",
  ".svelte-kit",
  ".angular",
  ".output",
  // Generic build output
  "dist",
  "build",
  "out",
  "target",
  "bin",
  "obj",
  // Test and coverage reports
  "coverage",
  ".nyc_output",
  "playwright-report",
  "test-results",
  // Python
  "__pycache__",
  ".venv",
  "venv",
  "env",
  ".conda",
  ".mypy_cache",
  ".pytest_cache",
  ".ruff_cache",
  ".tox",
  ".eggs",
  // Other ecosystems
  ".gradle",
  ".mvn",
  ".settings",
  ".bundle",
  "_build",
  "deps",
  "dist-newstyle",
  ".stack-work",
  "storybook-static",
  // Scratch and log directories
  "tmp",
  "temp",
  "logs",
  "log"
];

/**
 * Lower-case file names that are treated as sensitive and never opened.
 *
 * Besides id_rsa and id_ed25519 this also lists the other default OpenSSH
 * private-key file names (DSA, ECDSA and the security-key "_sk" variants) so
 * that all default SSH identities are covered.
 */
const SENSITIVE_EXACT_BASENAMES = /* @__PURE__ */ new Set([
  "id_rsa",
  "id_ed25519",
  "id_dsa",
  "id_ecdsa",
  "id_ecdsa_sk",
  "id_ed25519_sk",
  "known_hosts",
  "authorized_keys",
  ".npmrc",
  ".pypirc",
  ".netrc"
]);

/** Lower-case file-name endings that mark a file as sensitive. */
const SENSITIVE_SUFFIXES = [
  ".pem",
  ".key",
  ".p12",
  ".pfx",
  ".crt",
  ".cer",
  ".der",
  ".csr",
  ".p7b",
  ".p7c",
  ".jks",
  ".keystore",
  ".asc",
  ".gpg",
  ".kdbx"
];

/** Files larger than this many bytes (125 KiB) are skipped. */
const MAX_FILE_SIZE_BYTES = 125 * 1024;

/**
 * Ignore globs derived from SKIPPED_DIRECTORIES: for every directory one
 * pattern anchored at the scan root and one matching it at any depth.
 */
const FAST_GLOB_IGNORE_PATTERNS = SKIPPED_DIRECTORIES.flatMap((directory) => [
  `${directory}/**`,
  `**/${directory}/**`
]);
|
|
94
|
+
/**
 * Decide whether a path names a file that must not be opened.
 *
 * Only the lower-cased base name is inspected. A file is sensitive when it is
 * ".env" or starts with ".env.", when it is one of the exact sensitive names,
 * or when it ends with one of the sensitive suffixes.
 *
 * @param {string} filePath - Relative or absolute file path.
 * @returns {boolean} true when the file should be skipped.
 */
function shouldSkipSensitivePath(filePath) {
  const lowered = path.basename(filePath).toLowerCase();
  const isDotEnv = lowered === ".env" || lowered.startsWith(".env.");
  if (isDotEnv || SENSITIVE_EXACT_BASENAMES.has(lowered)) {
    return true;
  }
  for (const suffix of SENSITIVE_SUFFIXES) {
    if (lowered.endsWith(suffix)) {
      return true;
    }
  }
  return false;
}
|
|
104
|
+
/**
 * Convert a path that uses the platform separator into forward-slash form.
 *
 * @param {string} filePath - Path using path.sep as separator.
 * @returns {string} The same path with every separator replaced by "/".
 */
function normalizeGlobPath(filePath) {
  return filePath.replaceAll(path.sep, "/");
}
|
|
107
|
+
|
|
108
|
+
/**
 * Locate every occurrence of every rule's `match` string in `content`.
 *
 * Each rule is searched independently with non-overlapping indexOf scans. The
 * combined result is ordered by position; occurrences starting at the same
 * position are ordered by match length, shortest first.
 *
 * Rules whose `match` is the empty string are ignored: an empty needle is
 * found at every position and never advances the search, so scanning for it
 * would loop forever.
 *
 * @param {string} content - Text to search.
 * @param {Iterable<{ match: string }>} rules - Rules providing the strings to look for.
 * @returns {Array<{ index: number, match: string }>} Occurrences sorted by index.
 */
function findOccurrences(content, rules) {
  const matches = [];
  for (const { match } of rules) {
    if (match === "") {
      continue;
    }
    for (
      let index = content.indexOf(match);
      index !== -1;
      index = content.indexOf(match, index + match.length)
    ) {
      matches.push({ index, match });
    }
  }
  matches.sort((a, b) => a.index - b.index || a.match.length - b.match.length);
  return matches;
}
|
|
123
|
+
/**
 * Count how often each matched string occurs.
 *
 * @param {Iterable<{ match: string }>} matches - Occurrences to tally.
 * @returns {Map<string, number>} Count per matched string, in first-seen order.
 */
function countByMatch(matches) {
  const tally = /* @__PURE__ */ new Map();
  for (const { match: symbol } of matches) {
    const seenSoFar = tally.get(symbol);
    tally.set(symbol, seenSoFar === undefined ? 1 : seenSoFar + 1);
  }
  return tally;
}
|
|
130
|
+
/**
 * Rebuild `content` with the given occurrences replaced.
 *
 * Occurrences are consumed in the order given and are expected to be sorted
 * by ascending index. An occurrence is skipped when
 *  - it starts before the end of the previously applied one (overlap),
 *  - the replacement map has no entry for its match, or
 *  - `content` no longer contains the match text at the recorded index.
 *
 * The last check protects callers that computed the occurrences from an
 * earlier version of the text: a stale index is ignored instead of
 * overwriting unrelated characters.
 *
 * @param {string} content - Text to transform.
 * @param {Array<{ index: number, match: string }>} matches - Occurrences to apply.
 * @param {Map<string, string>} replacements - Replacement per matched string.
 * @returns {string} The transformed text (the input itself when `matches` is empty).
 */
function applyOccurrences(content, matches, replacements) {
  if (matches.length === 0) {
    return content;
  }
  let cursor = 0;
  let output = "";
  for (const { index, match } of matches) {
    if (index < cursor) {
      continue;
    }
    const replacement = replacements.get(match);
    if (replacement === undefined) {
      continue;
    }
    // Never trust a recorded position blindly; the text may have changed.
    if (!content.startsWith(match, index)) {
      continue;
    }
    output += content.slice(cursor, index);
    output += replacement;
    cursor = index + match.length;
  }
  output += content.slice(cursor);
  return output;
}
|
|
151
|
+
|
|
152
|
+
/**
 * Default normalization rules. Each row of the table below is
 * [match, replacement, description] and is expanded into a rule object with
 * exactly those three properties, in table order.
 */
const REPLACEMENT_RULES = [
  // Dashes
  ["\u2014", "-", "em dash"],
  ["\u2013", "-", "en dash"],
  ["\u2012", "-", "figure dash"],
  ["\u2015", "--", "horizontal bar"],
  ["\u2010", "-", "hyphen"],
  ["\u2011", "-", "non-breaking hyphen"],
  ["\u2212", "-", "minus sign"],
  // Arrows
  ["\u2192", "=>", "right arrow"],
  ["\u21D2", "=>", "double right arrow"],
  ["\u27F6", "=>", "long right arrow"],
  ["\u279C", "=>", "heavy right arrow"],
  ["\u2794", "=>", "black right arrow"],
  ["\u279D", "=>", "drafting right arrow"],
  // Check marks
  ["\u2714", "-", "heavy check mark"],
  ["\u2705", "-", "check mark button"],
  ["\u2611", "-", "ballot box with check"],
  ["\u2713", "-", "check mark"],
  // Bullets
  ["\u2022", "-", "bullet"],
  ["\u2023", "-", "triangular bullet"],
  ["\u25E6", "-", "white bullet"],
  ["\u25AA", "-", "small square bullet"],
  ["\xB7", "-", "middle dot bullet"],
  ["\u25CF", "-", "black circle bullet"],
  ["\u25CB", "-", "white circle bullet"],
  // Double quotes
  ["\u201C", '"', "left double quote"],
  ["\u201D", '"', "right double quote"],
  ["\u201E", '"', "low double quote"],
  ["\u201F", '"', "double high-reversed-9 quote"],
  ["\xAB", '"', "left guillemet"],
  ["\xBB", '"', "right guillemet"],
  // Single quotes
  ["\u2039", "'", "left single guillemet"],
  ["\u203A", "'", "right single guillemet"],
  ["\u2018", "'", "left single quote"],
  ["\u2019", "'", "right single quote / apostrophe"],
  ["\u201A", "'", "low single quote"],
  ["\u201B", "'", "single high-reversed-9 quote"],
  // Punctuation and comparison operators
  ["\u2026", "...", "ellipsis"],
  ["\u2264", "<=", "less-than-or-equal"],
  ["\u2265", ">=", "greater-than-or-equal"],
  ["\u2260", "!=", "not-equal"],
  // Special spaces become a regular space
  ["\xA0", " ", "no-break space"],
  ["\u202F", " ", "narrow no-break space"],
  ["\u2007", " ", "figure space"],
  // Invisible characters are removed
  ["\u200B", "", "zero-width space"],
  ["\u200C", "", "zero-width non-joiner"],
  ["\u200D", "", "zero-width joiner"],
  ["\u2060", "", "word joiner"],
  ["\uFEFF", "", "byte-order mark"]
].map(([match, replacement, description]) => ({ match, replacement, description }));

/** Lookup from matched string to its replacement, derived from REPLACEMENT_RULES. */
const REPLACEMENT_RULE_MAP = new Map(
  REPLACEMENT_RULES.map(({ match, replacement }) => [match, replacement])
);
|
|
205
|
+
|
|
206
|
+
/** Number of bytes read for each sample window of the binary check. */
const SAMPLE_SIZE = 2 ** 10;

/** Upper bound on the number of sample windows taken per file. */
const MAX_SAMPLES = 5;

/** A sample scoring at or above this ratio marks the file as binary. */
const SUSPICIOUS_RATIO = 0.15;
|
|
209
|
+
/**
 * Choose the byte offsets at which sample windows are read.
 *
 * Files that fit into one window are sampled once at offset 0. Larger files
 * get the first window, the last window and evenly spaced windows in
 * between; duplicate offsets are collapsed.
 *
 * @param {number} size - File size in bytes.
 * @returns {number[]} Distinct offsets in ascending order.
 */
function getSampleOffsets(size) {
  if (size <= SAMPLE_SIZE) {
    return [0];
  }
  const lastOffset = Math.max(0, size - SAMPLE_SIZE);
  const steps = MAX_SAMPLES - 1;
  const interior = Array.from({ length: Math.max(0, steps - 1) }, (_, position) =>
    Math.floor((lastOffset * (position + 1)) / steps)
  );
  const distinct = new Set([0, lastOffset, ...interior]);
  return [...distinct].sort((a, b) => a - b);
}
|
|
220
|
+
/**
 * Rate how binary-looking a byte sample is.
 *
 * A NUL byte immediately yields the maximum score of 1. Otherwise the score
 * is the share of bytes that are DEL (0x7f) or control characters below 0x20
 * other than tab, line feed, form feed and carriage return.
 *
 * @param {Iterable<number>} buffer - Byte values to inspect.
 * @returns {number} Score between 0 and 1; 0 for an empty sample.
 */
function scoreSample(buffer) {
  let flagged = 0;
  let inspected = 0;
  for (const byte of buffer) {
    if (byte === 0) {
      return 1;
    }
    inspected += 1;
    const isDelete = byte === 0x7f;
    const isDisallowedControl =
      byte < 0x20 && byte !== 0x09 && byte !== 0x0a && byte !== 0x0c && byte !== 0x0d;
    if (isDelete || isDisallowedControl) {
      flagged += 1;
    }
  }
  if (inspected === 0) {
    return 0;
  }
  return flagged / inspected;
}
|
|
235
|
+
/**
 * Heuristically decide whether a file is binary by scoring a few sample
 * windows instead of reading the whole file.
 *
 * @param {string} filePath - Path of the file to inspect.
 * @returns {Promise<boolean>} true as soon as one sample scores at or above
 *   SUSPICIOUS_RATIO, false when no sample does.
 */
async function isProbablyBinary(filePath) {
  const handle = await open(filePath, "r");
  try {
    const { size } = await handle.stat();
    // One scratch buffer is reused for every window; only the bytes actually
    // read are handed to the scorer.
    const scratch = Buffer.allocUnsafe(SAMPLE_SIZE);
    for (const offset of getSampleOffsets(size)) {
      const { bytesRead } = await handle.read(scratch, 0, SAMPLE_SIZE, offset);
      if (bytesRead > 0 && scoreSample(scratch.subarray(0, bytesRead)) >= SUSPICIOUS_RATIO) {
        return true;
      }
    }
    return false;
  } finally {
    await handle.close();
  }
}
|
|
256
|
+
|
|
257
|
+
/** Maximum number of files processed at the same time during a scan. */
const DEFAULT_CONCURRENCY = 64;
|
|
258
|
+
/**
 * Inspect a single file and report either why it was skipped or what was
 * found in it.
 *
 * Checks run from cheapest to most expensive: sensitive name, size limit,
 * binary heuristic, and only then the full UTF-8 read.
 *
 * @param {string} cwd - Directory that relativePath is resolved against.
 * @param {string} relativePath - File path relative to cwd.
 * @returns {Promise<{ kind: "sensitive" | "size" | "binary" | "none" } |
 *   { path: string, matches: Array<{ index: number, match: string }> }>}
 *   A `kind` marker when nothing is to be replaced, otherwise the normalized
 *   path together with its occurrences.
 */
async function processFile(cwd, relativePath) {
  const displayPath = normalizeGlobPath(relativePath);
  if (shouldSkipSensitivePath(displayPath)) {
    return { kind: "sensitive" };
  }

  const target = path.join(cwd, relativePath);
  const { size } = await stat(target);
  if (size > MAX_FILE_SIZE_BYTES) {
    return { kind: "size" };
  }

  const looksBinary = await isProbablyBinary(target);
  if (looksBinary) {
    return { kind: "binary" };
  }

  const text = await readFile(target, "utf8");
  const matches = findOccurrences(text, REPLACEMENT_RULES);
  return matches.length === 0 ? { kind: "none" } : { path: displayPath, matches };
}
|
|
281
|
+
/**
 * Walk `cwd` and collect the occurrences found in every eligible file.
 *
 * Paths are streamed from fast-glob (dot files included, symbolic links not
 * followed, FAST_GLOB_IGNORE_PATTERNS applied) and handed to processFile
 * with at most DEFAULT_CONCURRENCY files in flight.
 *
 * A file that cannot be processed (processFile rejects, e.g. because it
 * cannot be stat'ed or read) does not abort the scan. Instead of being
 * dropped silently it is counted in `skippedByError`.
 *
 * @param {string} cwd - Directory to scan.
 * @returns {Promise<{
 *   matchesByFile: Map<string, Array<{ index: number, match: string }>>,
 *   scannedFiles: number,
 *   skippedByGlob: number,
 *   skippedBySensitivePattern: number,
 *   skippedBySize: number,
 *   skippedByBinary: number,
 *   skippedByError: number
 * }>} Occurrences per file plus counters. `scannedFiles` counts every path
 *   delivered by the glob stream; `skippedByGlob` is always 0.
 */
async function scanDirectory(cwd) {
  const matchesByFile = /* @__PURE__ */ new Map();
  let scannedFiles = 0;
  let skippedBySensitivePattern = 0;
  let skippedBySize = 0;
  let skippedByBinary = 0;
  let skippedByError = 0;
  const stream = fg.stream("**/*", {
    cwd,
    onlyFiles: true,
    dot: true,
    followSymbolicLinks: false,
    unique: true,
    ignore: [...FAST_GLOB_IGNORE_PATTERNS]
  });
  const inFlight = /* @__PURE__ */ new Set();
  // Never rejects: every failure is turned into a counter so that
  // Promise.race / Promise.all below cannot throw.
  const schedule = async (relativePath) => {
    scannedFiles += 1;
    try {
      const result = await processFile(cwd, relativePath);
      if ("kind" in result) {
        if (result.kind === "sensitive") {
          skippedBySensitivePattern += 1;
        } else if (result.kind === "size") {
          skippedBySize += 1;
        } else if (result.kind === "binary") {
          skippedByBinary += 1;
        }
        return;
      }
      matchesByFile.set(result.path, result.matches);
    } catch {
      // Best effort: keep scanning, but make the skip visible to the caller.
      skippedByError += 1;
    }
  };
  for await (const entry of stream) {
    const relativePath = String(entry);
    const task = schedule(relativePath).finally(() => {
      inFlight.delete(task);
    });
    inFlight.add(task);
    // Back-pressure: wait for one task to settle before pulling more paths.
    if (inFlight.size >= DEFAULT_CONCURRENCY) {
      await Promise.race(inFlight);
    }
  }
  await Promise.all(inFlight);
  return {
    matchesByFile,
    scannedFiles,
    skippedByGlob: 0,
    skippedBySensitivePattern,
    skippedBySize,
    skippedByBinary,
    skippedByError
  };
}
|
|
334
|
+
|
|
335
|
+
export { FAST_GLOB_IGNORE_PATTERNS as F, MAX_FILE_SIZE_BYTES as M, REPLACEMENT_RULES as R, REPLACEMENT_RULE_MAP as a, applyOccurrences as b, countByMatch as c, shouldSkipSensitivePath as d, findOccurrences as f, isProbablyBinary as i, normalizeGlobPath as n, scanDirectory as s };
|
package/package.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "aslopcleaner",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "High-performance CLI to replace common LLM/AI Unicode punctuation and symbols with ASCII equivalents.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"aslopcleaner": "./dist/cli.mjs"
|
|
8
|
+
},
|
|
9
|
+
"exports": {
|
|
10
|
+
".": {
|
|
11
|
+
"import": "./dist/index.mjs"
|
|
12
|
+
}
|
|
13
|
+
},
|
|
14
|
+
"repository": {
|
|
15
|
+
"type": "git",
|
|
16
|
+
"url": "git+https://github.com/kyr0/defuss.git"
|
|
17
|
+
},
|
|
18
|
+
"author": "Aron Homberg <info@aron-homberg.de>",
|
|
19
|
+
"homepage": "https://github.com/kyr0/defuss",
|
|
20
|
+
"publishConfig": {
|
|
21
|
+
"access": "public"
|
|
22
|
+
},
|
|
23
|
+
"packageManager": "bun@1.3.9",
|
|
24
|
+
"sideEffects": false,
|
|
25
|
+
"files": [
|
|
26
|
+
"dist",
|
|
27
|
+
"README.md",
|
|
28
|
+
"LICENSE"
|
|
29
|
+
],
|
|
30
|
+
"scripts": {
|
|
31
|
+
"build": "rm -rf dist && pkgroll",
|
|
32
|
+
"start": "node dist/cli.mjs",
|
|
33
|
+
"dev": "node --enable-source-maps src/cli.ts",
|
|
34
|
+
"test": "vitest run",
|
|
35
|
+
"test:coverage": "vitest run --coverage",
|
|
36
|
+
"run:example": "tsx example/example.ts"
|
|
37
|
+
},
|
|
38
|
+
"keywords": [
|
|
39
|
+
"ascii",
|
|
40
|
+
"bun",
|
|
41
|
+
"node",
|
|
42
|
+
"cli",
|
|
43
|
+
"unicode",
|
|
44
|
+
"sanitizer",
|
|
45
|
+
"llm"
|
|
46
|
+
],
|
|
47
|
+
"license": "MIT",
|
|
48
|
+
"dependencies": {
|
|
49
|
+
"fast-glob": "^3.3.3"
|
|
50
|
+
},
|
|
51
|
+
"devDependencies": {
|
|
52
|
+
"@types/node": "^25.5.0",
|
|
53
|
+
"@vitest/coverage-v8": "^3.1.1",
|
|
54
|
+
"pkgroll": "^2.11.3",
|
|
55
|
+
"tsx": "^4.19.4",
|
|
56
|
+
"vitest": "^3.1.1"
|
|
57
|
+
}
|
|
58
|
+
}
|