easyen 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.EN.md +22 -0
- package/README.md +20 -0
- package/SKILL.md +106 -0
- package/dist/base-form.d.ts +6 -0
- package/dist/base-form.js +98 -0
- package/dist/classify.d.ts +22 -0
- package/dist/classify.js +50 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +108 -0
- package/dist/coverage.d.ts +64 -0
- package/dist/coverage.js +120 -0
- package/dist/dictionaries/academic.json +1 -0
- package/dist/dictionaries/everyday.json +1 -0
- package/dist/dictionaries/frameworks.json +1 -0
- package/dist/dictionaries/index.d.ts +19 -0
- package/dist/dictionaries/index.js +49 -0
- package/dist/dictionaries/tech.json +1 -0
- package/dist/dictionary.d.ts +8 -0
- package/dist/dictionary.js +30 -0
- package/dist/extract.d.ts +14 -0
- package/dist/extract.js +37 -0
- package/dist/index.d.ts +33 -0
- package/dist/index.js +53 -0
- package/dist/irregulars.d.ts +11 -0
- package/dist/irregulars.js +108 -0
- package/dist/normalize.d.ts +11 -0
- package/dist/normalize.js +38 -0
- package/dist/pipe.d.ts +8 -0
- package/dist/pipe.js +13 -0
- package/dist/sentences.d.ts +23 -0
- package/dist/sentences.js +36 -0
- package/package.json +75 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 zhangxiangliang
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.EN.md
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# easyen
|
|
2
|
+
|
|
3
|
+
[English](README.EN.md) · [中文](README.md)
|
|
4
|
+
|
|
5
|
+
Ever notice how AI likes to dress English up? It says simple things in hard
|
|
6
|
+
words, and gives one idea three different ways — tiring to read. It is not the
|
|
7
|
+
AI's fault; it is just how it was trained, and it does not feel it.
|
|
8
|
+
|
|
9
|
+
easyen checks two things in the AI's English, and reminds it to keep things
|
|
10
|
+
simple:
|
|
11
|
+
|
|
12
|
+
* **Words** — which words are too hard, picked out for you.
|
|
13
|
+
* **Sentences** — which sentences are too long, picked out for you.
|
|
14
|
+
|
|
15
|
+
It only measures; it changes nothing. It shows you the hard words and the long
|
|
16
|
+
sentences clearly. Whether to change them, and how — the AI decides.
|
|
17
|
+
|
|
18
|
+
## How to use
|
|
19
|
+
|
|
20
|
+
Hand this one line to your AI (like Claude Code), and it does the rest:
|
|
21
|
+
|
|
22
|
+
> Read and follow https://github.com/zhangxiangliang/easyen/blob/main/SKILL.md
|
package/README.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# easyen
|
|
2
|
+
|
|
3
|
+
[English](README.EN.md) · [中文](README.md)
|
|
4
|
+
|
|
5
|
+
你有没有发现,让 AI 写英文,它总爱端着——简单的意思非说得文绉绉,一个意思还换着好几种说法,读起来累人。
|
|
6
|
+
这不怪它,是训练出来的习惯,它自己没感觉。
|
|
7
|
+
|
|
8
|
+
easyen 就帮 AI 量两件事,提醒它把话说简单点:
|
|
9
|
+
|
|
10
|
+
* **词汇** —— 哪些词超纲了,挑出来。
|
|
11
|
+
* **句子** —— 哪些句子太长了,挑出来。
|
|
12
|
+
|
|
13
|
+
它只量、不改。哪些词偏难、哪些句子绕,给你标得清清楚楚;至于换不换、怎么换,
|
|
14
|
+
AI 自己拿主意。
|
|
15
|
+
|
|
16
|
+
## 如何使用
|
|
17
|
+
|
|
18
|
+
把下面这句话丢给你的 AI(比如 Claude Code),剩下的它自己搞定:
|
|
19
|
+
|
|
20
|
+
> Read and follow https://github.com/zhangxiangliang/easyen/blob/main/SKILL.md
|
package/SKILL.md
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: easyen
|
|
3
|
+
description: Check how easy an English text is to read — its word level and its sentence length. Use it when writing or checking English that should stay simple (for learners), to find hard words to change and long sentences to break up, or to keep your own English at a chosen level.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# easyen
|
|
7
|
+
|
|
8
|
+
See how easy a piece of English is to read, by running `npx easyen`.
|
|
9
|
+
It gives you two simple signals — you decide what to change:
|
|
10
|
+
|
|
11
|
+
1. **Words** — how many of the words are in a chosen word list, and which words
|
|
12
|
+
are too hard.
|
|
13
|
+
2. **Sentences** — the average sentence length, and which sentences are long.
|
|
14
|
+
|
|
15
|
+
easyen only reports. It does not change your text. You decide which hard words
|
|
16
|
+
to make simpler and which long sentences to break up.
|
|
17
|
+
|
|
18
|
+
## When to use
|
|
19
|
+
|
|
20
|
+
- You wrote some English and want it to stay simple for the reader.
|
|
21
|
+
- You want to find the hard words or long sentences in a text.
|
|
22
|
+
- You want to keep your writing at a level (everyday, academic, or technical).
|
|
23
|
+
|
|
24
|
+
## How to run
|
|
25
|
+
|
|
26
|
+
No install needed — `npx easyen` gets the package on first run. It reads text
|
|
27
|
+
from stdin and prints JSON. Pipe the text in, using whatever your shell offers
|
|
28
|
+
(this follows the Unix convention, and works on macOS, Linux, and Windows
|
|
29
|
+
PowerShell / cmd). `everyday` is the default word list.
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
cat your-doc.md | npx easyen # macOS / Linux
|
|
33
|
+
Get-Content your-doc.md | npx easyen # Windows PowerShell
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
If piping is hard on your system, read a file directly instead:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
npx easyen --file your-doc.md --dict everyday,tech
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
You can also compose with other tools — e.g. strip code first so it does not
|
|
43
|
+
count as hard words: `<remove code> | npx easyen --dict everyday,tech`.
|
|
44
|
+
|
|
45
|
+
### Choose a word list by reader
|
|
46
|
+
|
|
47
|
+
`everyday` is the base. The others are **add-ons** — combine them onto everyday,
|
|
48
|
+
like `everyday,academic`; each one only adds its own extra words.
|
|
49
|
+
|
|
50
|
+
- `everyday` — the base, about 2800 common words. Use it alone for the simplest level.
|
|
51
|
+
- `academic` — academic words. `--dict everyday,academic`
|
|
52
|
+
- `tech` — software words (api, deploy, schema ...). `--dict everyday,tech`
|
|
53
|
+
- `frameworks` — tool and language names (vue, vite, webpack, docker ...).
|
|
54
|
+
`--dict everyday,tech,frameworks` for a frontend or server text.
|
|
55
|
+
|
|
56
|
+
Use only what the reader needs — a smaller list marks more words to change. You
|
|
57
|
+
can also add your own word file (one word per line), e.g. your team's product
|
|
58
|
+
names:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
cat your-doc.md | npx easyen --dict everyday,tech,./our-product-names.txt
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Reading the result
|
|
65
|
+
|
|
66
|
+
The JSON has:
|
|
67
|
+
|
|
68
|
+
- `ratio` — 0 to 1, how much of the text is in the word list (higher = easier).
|
|
69
|
+
- `hardWords` — words not in the list, sorted A–Z. These are the ones to change.
|
|
70
|
+
- `hardWordCounts` — the same words with how many times each one appears, most
|
|
71
|
+
first. Fix the common ones first.
|
|
72
|
+
- `sentences.wordsPerSentence` — the average sentence length (lower reads easier).
|
|
73
|
+
- `sentences.longSentences` — sentences over 30 words. These are the ones to
|
|
74
|
+
break up.
|
|
75
|
+
|
|
76
|
+
## What to do with it
|
|
77
|
+
|
|
78
|
+
These are **defaults, not rules**. Adapt them to your project, your reader, and
|
|
79
|
+
your taste — projects differ, and readers know different words. The signals show
|
|
80
|
+
you what to look at; you choose what to change.
|
|
81
|
+
|
|
82
|
+
**Hard words.** A hard word may have a simpler form (verify → check) — change it.
|
|
83
|
+
Or it may be a needed term (an api name), or a word the reader already knows (a
|
|
84
|
+
reader who learned English for an exam may find `implement` easier than "carry
|
|
85
|
+
out") — then keep it. You decide, based on the reader.
|
|
86
|
+
|
|
87
|
+
**Long sentences.** Split them: one idea per sentence, about 15–20 words,
|
|
88
|
+
subject then verb then object. Avoid chaining ideas with "which" / "that", and
|
|
89
|
+
avoid stacking nouns ("data migration rollback procedure" → "a plan to undo the
|
|
90
|
+
data change").
|
|
91
|
+
|
|
92
|
+
**A few habits that help, when they fit:**
|
|
93
|
+
- Keep a technical term, but explain it the first time ("idempotent — safe to
|
|
94
|
+
run more than once with the same result").
|
|
95
|
+
- Use the same word for the same thing every time (not ticket → task → work item).
|
|
96
|
+
- Drop idioms and slang ("a piece of cake" → "easy").
|
|
97
|
+
- Prefer short paragraphs, lists, and one example after each point.
|
|
98
|
+
|
|
99
|
+
Then run easyen again to check the signals improved.
|
|
100
|
+
|
|
101
|
+
## Notes
|
|
102
|
+
|
|
103
|
+
- Remove code and links from the text before you check it, so code does not
|
|
104
|
+
count as hard words.
|
|
105
|
+
- Keep the word list small — that is the point: a small list marks more words to
|
|
106
|
+
make simpler. Do not reach for a bigger list just to raise the score.
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Return every possible base form for a lower-cased word, including the word
|
|
3
|
+
* itself. The result is a de-duplicated array in a fixed, deterministic order
|
|
4
|
+
* (insertion order), so look-ups always try the forms the same way.
|
|
5
|
+
*/
|
|
6
|
+
export declare function possibleBaseForms(word: string): string[];
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.possibleBaseForms = possibleBaseForms;
|
|
4
|
+
/**
|
|
5
|
+
* Reduce an inflected word to a set of candidate base forms.
|
|
6
|
+
*
|
|
7
|
+
* "base form" = the dictionary form of a word: studies -> study, ran -> run.
|
|
8
|
+
* We do NOT need the single correct base form. We only ask: "is any plausible
|
|
9
|
+
* base form of this word in the dictionary?" So we generate several candidates
|
|
10
|
+
* by reversing common English ending rules, plus the irregular map, and let
|
|
11
|
+
* the caller test each one against the dictionary.
|
|
12
|
+
*/
|
|
13
|
+
const irregulars_1 = require("./irregulars");
|
|
14
|
+
/**
 * Undo a doubled final consonant: "stopp" -> "stop", "runn" -> "run".
 *
 * Returns the stem without its last character when the last two characters
 * are equal and are not one of the vowels a/e/i/o/u. Returns null otherwise
 * (including for stems shorter than two characters).
 */
function deDouble(stem) {
    const length = stem.length;
    if (length < 2) {
        return null;
    }
    const last = stem.charAt(length - 1);
    const isRepeated = last === stem.charAt(length - 2);
    const isVowel = "aeiou".indexOf(last) !== -1;
    return isRepeated && !isVowel ? stem.slice(0, -1) : null;
}
|
|
25
|
+
/**
 * Register a stem and its close spellings as base-form candidates.
 *
 * Adds, in this order: the bare stem (only when it has 2+ characters), the
 * stem with a trailing "e" ("mak" -> "make"), and the stem with a doubled
 * final consonant undone ("stopp" -> "stop", only when the result has 2+
 * characters). `out` is mutated in place; nothing is returned.
 */
function pushStemVariants(out, stem) {
    const variants = [];
    if (stem.length >= 2) {
        variants.push(stem);
    }
    variants.push(`${stem}e`);
    const undoubled = deDouble(stem);
    if (undoubled && undoubled.length >= 2) {
        variants.push(undoubled);
    }
    for (const variant of variants) {
        out.add(variant);
    }
}
|
|
37
|
+
/**
 * Return every possible base form for a lower-cased word, including the word
 * itself. The result is a de-duplicated array in a fixed, deterministic order
 * (insertion order), so look-ups always try the forms the same way.
 *
 * The candidates are guesses made by reversing common English endings; the
 * caller is expected to test each one against its dictionary.
 *
 * @param {string} word A lower-cased word.
 * @returns {string[]} Candidate base forms; the first entry is always `word`.
 */
function possibleBaseForms(word) {
    const out = new Set();
    out.add(word);
    // Read the irregular map as own, string-valued entries only. A bare
    // `IRREGULARS[word]` also finds inherited members, so a word such as
    // "constructor" would put a function into the result.
    const irregularMap = irregulars_1.IRREGULARS;
    if (Object.prototype.hasOwnProperty.call(irregularMap, word)) {
        const irregular = irregularMap[word];
        if (typeof irregular === "string" && irregular) {
            out.add(irregular);
        }
    }
    // Too short to inflect meaningfully.
    if (word.length <= 2) {
        return [...out];
    }
    // --- plural / 3rd person singular: -s / -es / -ies ---
    if (word.endsWith("ies") && word.length > 4) {
        out.add(word.slice(0, -3) + "y"); // studies -> study
    }
    if (word.endsWith("ves") && word.length > 4) {
        out.add(word.slice(0, -3) + "f"); // wolves -> wolf
        out.add(word.slice(0, -3) + "fe"); // knives -> knife
    }
    if (word.endsWith("es") && word.length > 3) {
        out.add(word.slice(0, -2)); // boxes -> box, goes -> go
        out.add(word.slice(0, -1)); // houses -> house
    }
    if (word.endsWith("s") && !word.endsWith("ss") && word.length > 3) {
        out.add(word.slice(0, -1)); // cats -> cat
    }
    // --- past tense / past participle: -ed / -ied ---
    if (word.endsWith("ied") && word.length > 4) {
        out.add(word.slice(0, -3) + "y"); // studied -> study
    }
    if (word.endsWith("ed") && word.length > 3) {
        pushStemVariants(out, word.slice(0, -2)); // walked->walk, liked->like, stopped->stop
    }
    // --- present participle / gerund: -ing ---
    if (word.endsWith("ing") && word.length > 4) {
        pushStemVariants(out, word.slice(0, -3)); // walking->walk, making->make, running->run
    }
    // --- comparative / superlative: -er / -est ---
    if (word.endsWith("ier") && word.length > 4) {
        out.add(word.slice(0, -3) + "y"); // happier -> happy
    }
    if (word.endsWith("iest") && word.length > 5) {
        out.add(word.slice(0, -4) + "y"); // happiest -> happy
    }
    if (word.endsWith("er") && word.length > 3) {
        pushStemVariants(out, word.slice(0, -2)); // bigger->big, taller->tall
    }
    if (word.endsWith("est") && word.length > 4) {
        pushStemVariants(out, word.slice(0, -3)); // biggest->big, tallest->tall
    }
    // --- adverb: -ly ---
    if (word.endsWith("ily") && word.length > 4) {
        out.add(word.slice(0, -3) + "y"); // happily -> happy
    }
    if (word.endsWith("ly") && word.length > 3) {
        out.add(word.slice(0, -2)); // quickly -> quick
    }
    return [...out];
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tiny predicates over a single token. Each answers one yes/no question and
|
|
3
|
+
* is pure.
|
|
4
|
+
*/
|
|
5
|
+
/** True when the token starts with a digit: "3", "3.14", "1,000" — and also mixed tokens such as "3rd". */
|
|
6
|
+
export declare function isNumber(token: string): boolean;
|
|
7
|
+
/** True when the token starts with an upper-case letter. */
|
|
8
|
+
export declare function isCapitalized(token: string): boolean;
|
|
9
|
+
/**
|
|
10
|
+
* True for a single-letter token that is not a real word. "a" and "i" are
|
|
11
|
+
* words; every other lone letter (b, e, g, x ...) comes from abbreviations,
|
|
12
|
+
* list markers or variable names and is not vocabulary.
|
|
13
|
+
*/
|
|
14
|
+
export declare function isSingleLetter(token: string): boolean;
|
|
15
|
+
/** True when the token is a spelled-out number word ("two", "zero"). */
|
|
16
|
+
export declare function isNumberWord(token: string): boolean;
|
|
17
|
+
/**
|
|
18
|
+
* True for an all-caps token of 2+ letters (AWS, JSON, SEO). These are
|
|
19
|
+
* acronyms, not vocabulary. Used only when the token is also unknown, so a
|
|
20
|
+
* normal word written in capitals (NOTE) still counts via its lower-case form.
|
|
21
|
+
*/
|
|
22
|
+
export declare function isAllCapitals(token: string): boolean;
|
package/dist/classify.js
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Tiny predicates over a single token. Each answers one yes/no question and
|
|
4
|
+
* is pure.
|
|
5
|
+
*/
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.isNumber = isNumber;
|
|
8
|
+
exports.isCapitalized = isCapitalized;
|
|
9
|
+
exports.isSingleLetter = isSingleLetter;
|
|
10
|
+
exports.isNumberWord = isNumberWord;
|
|
11
|
+
exports.isAllCapitals = isAllCapitals;
|
|
12
|
+
/**
 * True when the token begins with an ASCII digit (0-9). That covers "3",
 * "3.14" and "1,000"; only the first character is inspected, so a token
 * such as "3rd" also counts.
 */
function isNumber(token) {
    const first = String(token).charAt(0);
    return first >= "0" && first <= "9";
}
|
|
16
|
+
/** True when the first character of the token is an ASCII upper-case letter (A-Z). */
function isCapitalized(token) {
    const first = String(token).charAt(0);
    return first >= "A" && first <= "Z";
}
|
|
20
|
+
/**
 * True for a one-character token that is not a real word. Only "a" and "i"
 * (in either case) are treated as words; any other lone character — from
 * abbreviations, list markers or variable names — is not vocabulary.
 */
function isSingleLetter(token) {
    const REAL_ONE_LETTER_WORDS = ["a", "i"];
    return token.length === 1 && !REAL_ONE_LETTER_WORDS.includes(token.toLowerCase());
}
|
|
31
|
+
/** Spelled-out cardinal numbers and magnitudes, stored lower-case. Treated like digits. */
const NUMBER_WORDS = new Set([
    // 0-19
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen",
    // tens
    "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
    // magnitudes
    "hundred", "thousand", "million", "billion", "trillion",
]);
/**
 * True when the token, compared case-insensitively, is one of the
 * spelled-out number words above ("two", "Zero").
 */
function isNumberWord(token) {
    const lowered = token.toLowerCase();
    return NUMBER_WORDS.has(lowered);
}
|
|
43
|
+
/**
 * True for a token made only of ASCII capitals, two or more of them (AWS,
 * JSON, SEO). Such tokens are acronyms rather than vocabulary. The caller
 * applies this only to tokens that are also unknown, so an ordinary word
 * written in capitals (NOTE) is still counted through its lower-case form.
 */
function isAllCapitals(token) {
    if (token.length < 2) {
        return false;
    }
    return /^[A-Z]+$/.test(token);
}
|
package/dist/cli.d.ts
ADDED
package/dist/cli.js
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
"use strict";
|
|
3
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
4
|
+
/**
|
|
5
|
+
* Command-line interface for easyen. Reads text from stdin (the Unix way) and
|
|
6
|
+
* prints JSON, so it composes with other tools across platforms:
|
|
7
|
+
*
|
|
8
|
+
* cat doc.md | npx easyen --dict everyday,tech # macOS / Linux
|
|
9
|
+
* Get-Content doc.md | npx easyen # Windows PowerShell
|
|
10
|
+
* type doc.md | npx easyen # Windows cmd
|
|
11
|
+
*
|
|
12
|
+
* When piping is not convenient, read a file directly: npx easyen --file doc.md
|
|
13
|
+
*
|
|
14
|
+
* Zero dependencies: arguments are parsed by hand.
|
|
15
|
+
*/
|
|
16
|
+
const node_fs_1 = require("node:fs");
|
|
17
|
+
const index_1 = require("./index");
|
|
18
|
+
/**
 * Parse the command-line arguments by hand (no dependencies).
 *
 * Recognised flags: -h/--help, -d/--dict <spec>, -f/--file <path>,
 * --proper-nouns and --count-numbers. Anything else is skipped. A value flag
 * consumes the following argument; when --dict has no following argument the
 * default ("everyday") is kept.
 *
 * @param {string[]} argv Arguments after the node binary and script path.
 * @returns {{dict: string, options: object, help: boolean, file?: string}}
 */
function parseArgs(argv) {
    const parsed = { dict: "everyday", options: {}, help: false };
    let index = 0;
    while (index < argv.length) {
        const flag = argv[index];
        if (flag === "-h" || flag === "--help") {
            parsed.help = true;
        }
        else if (flag === "-d" || flag === "--dict") {
            index += 1;
            const spec = argv[index];
            if (spec !== undefined && spec !== null) {
                parsed.dict = spec;
            }
        }
        else if (flag === "-f" || flag === "--file") {
            index += 1;
            parsed.file = argv[index];
        }
        else if (flag === "--proper-nouns") {
            // Ignore likely proper nouns.
            parsed.options.ignoreProperNouns = true;
        }
        else if (flag === "--count-numbers") {
            // Count numbers instead of ignoring them.
            parsed.options.ignoreNumbers = false;
        }
        index += 1;
    }
    return parsed;
}
|
|
44
|
+
const HELP = `easyen — check how easy an English text is to read. Prints JSON.
|
|
45
|
+
|
|
46
|
+
Reads text from stdin (pipe it in), the standard Unix way. Works on any shell
|
|
47
|
+
that supports pipes (bash, PowerShell, cmd). If piping is hard, use --file.
|
|
48
|
+
|
|
49
|
+
Usage:
|
|
50
|
+
cat file.md | easyen [options] # macOS / Linux
|
|
51
|
+
Get-Content file.md | easyen [options] # Windows PowerShell
|
|
52
|
+
easyen --file file.md [options] # any OS, no pipe
|
|
53
|
+
|
|
54
|
+
Options:
|
|
55
|
+
-d, --dict <spec> Word list to use (default: everyday). Comma-separated to
|
|
56
|
+
combine: each item is a built-in name OR a path to a
|
|
57
|
+
word-list file. e.g. --dict everyday,tech,./terms.txt
|
|
58
|
+
-f, --file <path> Read the text from a file instead of stdin
|
|
59
|
+
--proper-nouns Ignore capitalised unknown words (names, places)
|
|
60
|
+
--count-numbers Count numbers instead of ignoring them
|
|
61
|
+
-h, --help Show this help
|
|
62
|
+
|
|
63
|
+
Built-in word lists: ${(0, index_1.listDictionaries)().join(", ")}`;
|
|
64
|
+
/**
 * Turn a --dict spec into a dictionary source.
 *
 * The spec is a comma-separated list; blank items are dropped. When it is
 * exactly one built-in name, that name is returned unchanged so the cached
 * Set for it can be reused. Otherwise each item is either kept as a built-in
 * name or read as a UTF-8 file and split on whitespace, and all items are
 * merged into one Set.
 */
function resolveDictSpec(spec) {
    const { listDictionaries, combineDictionaries } = index_1;
    const { readFileSync } = node_fs_1;
    const builtinNames = new Set(listDictionaries());
    const items = [];
    for (const piece of spec.split(",")) {
        const item = piece.trim();
        if (item) {
            items.push(item);
        }
    }
    if (items.length === 1 && builtinNames.has(items[0])) {
        return items[0];
    }
    const sources = [];
    for (const item of items) {
        if (builtinNames.has(item)) {
            sources.push(item);
        }
        else {
            sources.push(readFileSync(item, "utf8").split(/\s+/));
        }
    }
    return combineDictionaries(...sources);
}
|
|
77
|
+
/**
 * CLI entry point: parse the arguments, read the text (from --file or
 * stdin), run the word and sentence checks and print the combined result as
 * indented JSON. Prints the help text for --help, or when there is neither a
 * --file nor piped input. Any failure is written to stderr and the process
 * exit code is set to 1.
 */
function main() {
    const { help, file, dict, options } = parseArgs(process.argv.slice(2));
    // --help, or no input at all (no --file and nothing piped): show help.
    if (help || (!file && process.stdin.isTTY)) {
        console.log(HELP);
        return;
    }
    const fail = (message) => {
        console.error(message);
        process.exitCode = 1;
    };
    try {
        const text = file ? (0, node_fs_1.readFileSync)(file, "utf8") : readStdin();
        if (!text.trim()) {
            fail("No text given. Pipe text in, or use --file <path>.");
            return;
        }
        const coverage = (0, index_1.checkCoverage)(text, resolveDictSpec(dict), options);
        const sentences = (0, index_1.checkSentences)(text);
        const report = { ...coverage, sentences };
        console.log(JSON.stringify(report, null, 2));
    }
    catch (error) {
        fail(error instanceof Error ? error.message : String(error));
    }
}
|
|
100
|
+
/**
 * Read all of stdin (file descriptor 0) synchronously as UTF-8 text.
 * Returns an empty string when the read throws, so the caller can report
 * "no text" instead of crashing.
 */
function readStdin() {
    let piped = "";
    try {
        piped = (0, node_fs_1.readFileSync)(0, "utf8");
    }
    catch {
        // Best effort: an unreadable stdin is treated as empty input.
    }
    return piped;
}
|
|
108
|
+
main();
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { DictionaryName } from "./dictionaries";
|
|
2
|
+
/** What you can pass as the dictionary argument. */
|
|
3
|
+
export type DictionarySource = DictionaryName | Iterable<string> | ReadonlySet<string>;
|
|
4
|
+
export interface CheckOptions {
|
|
5
|
+
/**
|
|
6
|
+
* Ignore number tokens ("3", "3.14") and spelled-out number words ("two"). Numbers are not vocabulary, so
|
|
7
|
+
* they are excluded from the ratio by default.
|
|
8
|
+
* @default true
|
|
9
|
+
*/
|
|
10
|
+
ignoreNumbers?: boolean;
|
|
11
|
+
/**
|
|
12
|
+
* Ignore capitalised words that are not in the dictionary (likely proper
|
|
13
|
+
* nouns such as names or places). Note: this is a simple, position-free
|
|
14
|
+
* heuristic — with a small dictionary it may also drop a sentence-initial
|
|
15
|
+
* normal word. Use a full dictionary for best results.
|
|
16
|
+
* @default false
|
|
17
|
+
*/
|
|
18
|
+
ignoreProperNouns?: boolean;
|
|
19
|
+
}
|
|
20
|
+
export interface WordResult {
|
|
21
|
+
/** Lower-cased word that was checked. */
|
|
22
|
+
word: string;
|
|
23
|
+
/** Whether a base form of this word is in the dictionary. */
|
|
24
|
+
known: boolean;
|
|
25
|
+
/** The matching dictionary entry, if found (may differ from `word`). */
|
|
26
|
+
base?: string;
|
|
27
|
+
}
|
|
28
|
+
/** A hard word and how many times it appears in the text. */
|
|
29
|
+
export interface HardWord {
|
|
30
|
+
word: string;
|
|
31
|
+
count: number;
|
|
32
|
+
}
|
|
33
|
+
export interface CoverageResult {
|
|
34
|
+
/** Number of counted words (after the ignore filters). */
|
|
35
|
+
total: number;
|
|
36
|
+
/** Number of counted words found in the dictionary. */
|
|
37
|
+
covered: number;
|
|
38
|
+
/** covered / total, in [0, 1]. Is 0 when there are no counted words. */
|
|
39
|
+
ratio: number;
|
|
40
|
+
/** Hard words (not in the dictionary), unique and sorted A–Z. Reword these. */
|
|
41
|
+
hardWords: string[];
|
|
42
|
+
/** Hard words with how many times each appears, most frequent first. */
|
|
43
|
+
hardWordCounts: HardWord[];
|
|
44
|
+
/** Per-word result, in reading order. */
|
|
45
|
+
details: WordResult[];
|
|
46
|
+
}
|
|
47
|
+
/**
|
|
48
|
+
* Combine several dictionary sources (built-in names, word lists, or Sets)
|
|
49
|
+
* into one lower-cased look-up Set. Lets a caller compose their own
|
|
50
|
+
* vocabulary — e.g. the everyday list plus a list of allowed domain words:
|
|
51
|
+
*
|
|
52
|
+
* const dict = combineDictionaries("everyday", ["api", "endpoint"]);
|
|
53
|
+
* checkCoverage(text, dict);
|
|
54
|
+
*/
|
|
55
|
+
export declare function combineDictionaries(...sources: DictionarySource[]): Set<string>;
|
|
56
|
+
/**
|
|
57
|
+
* Check what fraction of a text's words are covered by a vocabulary.
|
|
58
|
+
*
|
|
59
|
+
* @param text The text to check.
|
|
60
|
+
* @param dictionary A built-in dictionary name, a word list (base forms
|
|
61
|
+
* only), or a prebuilt Set.
|
|
62
|
+
* @param options See {@link CheckOptions}.
|
|
63
|
+
*/
|
|
64
|
+
export declare function checkCoverage(text: string, dictionary: DictionarySource, options?: CheckOptions): CoverageResult;
|
package/dist/coverage.js
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.combineDictionaries = combineDictionaries;
|
|
4
|
+
exports.checkCoverage = checkCoverage;
|
|
5
|
+
/**
|
|
6
|
+
* The top-level pipeline. `checkCoverage` only wires the small pure steps
|
|
7
|
+
* together; every real piece of logic lives in its own single-purpose
|
|
8
|
+
* function (normalize -> expand -> extract -> match -> summarize).
|
|
9
|
+
*/
|
|
10
|
+
const pipe_1 = require("./pipe");
|
|
11
|
+
const normalize_1 = require("./normalize");
|
|
12
|
+
const extract_1 = require("./extract");
|
|
13
|
+
const classify_1 = require("./classify");
|
|
14
|
+
const dictionary_1 = require("./dictionary");
|
|
15
|
+
const dictionaries_1 = require("./dictionaries");
|
|
16
|
+
/** Clean raw text into a space-separated, contraction-free string. */
|
|
17
|
+
const prepare = (0, pipe_1.pipe)(normalize_1.normalizeApostrophes, normalize_1.expandContractions);
|
|
18
|
+
/**
 * Turn one extracted token into a WordResult, or null if it is filtered out.
 *
 * Filtered out (null): number tokens and number words while
 * `options.ignoreNumbers` is on; lone letters other than "a"/"i"; unknown
 * all-capitals tokens (acronyms); unknown capitalised tokens while
 * `options.ignoreProperNouns` is on.
 *
 * @param {string} token One token in its original casing.
 * @param {ReadonlySet<string>} dict The look-up set of known base forms.
 * @param {{ignoreNumbers: boolean, ignoreProperNouns: boolean}} options
 * @returns {{word: string, known: boolean, base?: string} | null}
 */
function classifyToken(token, dict, options) {
    const lower = token.toLowerCase();
    // Numbers and spelled-out number words are not vocabulary to grade.
    if ((0, classify_1.isNumber)(token) || (0, classify_1.isNumberWord)(token)) {
        if (options.ignoreNumbers)
            return null;
        // When numbers are counted, they are looked up like any other word.
        // Marking them unknown unconditionally would report "one" or "two"
        // as hard words even when the word list contains them.
        const numberBase = (0, dictionary_1.findInDictionary)(lower, dict);
        return numberBase === null
            ? { word: lower, known: false }
            : { word: lower, known: true, base: numberBase };
    }
    // A lone letter (b, e, g, x ...) is never a word; never count it.
    if ((0, classify_1.isSingleLetter)(token))
        return null;
    const base = (0, dictionary_1.findInDictionary)(lower, dict);
    if (base !== null)
        return { word: lower, known: true, base };
    // Unknown all-capitals word (AWS, JSON) is a short form, not vocabulary.
    if ((0, classify_1.isAllCapitals)(token))
        return null;
    if (options.ignoreProperNouns && (0, classify_1.isCapitalized)(token))
        return null;
    return { word: lower, known: false };
}
|
|
39
|
+
/**
 * Merge any number of dictionary sources into a single look-up Set.
 *
 * A string source is taken as a built-in dictionary name and expanded to its
 * word list; any other source is iterated as-is. Every word is trimmed and
 * lower-cased, and empty entries are skipped. Example — the everyday list
 * plus a few allowed domain words:
 *
 *     const dict = combineDictionaries("everyday", ["api", "endpoint"]);
 *     checkCoverage(text, dict);
 */
function combineDictionaries(...sources) {
    const merged = new Set();
    sources.forEach((source) => {
        const words = typeof source === "string"
            ? (0, dictionaries_1.getDictionary)(source)
            : source;
        for (const raw of words) {
            const word = raw.trim().toLowerCase();
            if (word !== "") {
                merged.add(word);
            }
        }
    });
    return merged;
}
|
|
59
|
+
/**
 * Cache of look-up sets for built-in dictionaries, keyed by dictionary name.
 * Each built-in is built on first use and reused afterwards.
 */
const builtinCache = new Map();
/**
 * Turn any accepted dictionary source into a look-up set: a Set is returned
 * as-is, a string is treated as a built-in name (built once, then served
 * from the cache), and anything else is passed to buildDictionary.
 */
function resolveDictionary(source) {
    if (source instanceof Set) {
        return source;
    }
    if (typeof source !== "string") {
        return (0, dictionary_1.buildDictionary)(source);
    }
    let lookup = builtinCache.get(source);
    if (!lookup) {
        lookup = (0, dictionary_1.buildDictionary)((0, dictionaries_1.getDictionary)(source));
        builtinCache.set(source, lookup);
    }
    return lookup;
}
|
|
77
|
+
/**
 * Fold the per-word results into the final coverage summary.
 *
 * Known words are counted as covered; unknown words are tallied per word.
 * `hardWords` is the unique unknown words sorted A–Z, `hardWordCounts` the
 * same words ordered by count (highest first, ties A–Z), and `ratio` is
 * covered / total, or 0 when there are no results. `details` is the input
 * array itself.
 */
function summarize(results) {
    const tally = new Map();
    for (const { known, word } of results) {
        if (!known) {
            tally.set(word, (tally.get(word) ?? 0) + 1);
        }
    }
    const covered = results.filter((result) => result.known).length;
    // Most frequent first; ties sorted A–Z for a stable, deterministic order.
    const byCountThenWord = (a, b) => b.count - a.count || a.word.localeCompare(b.word);
    const hardWordCounts = Array.from(tally, ([word, count]) => ({ word, count }));
    hardWordCounts.sort(byCountThenWord);
    const hardWords = Array.from(tally.keys()).sort();
    const total = results.length;
    const ratio = total === 0 ? 0 : covered / total;
    return { total, covered, ratio, hardWords, hardWordCounts, details: results };
}
|
|
101
|
+
/**
 * Measure how much of a text is covered by a vocabulary.
 *
 * The text is prepared, split into words, camel-case tokens are split
 * further, each token is classified against the dictionary, filtered-out
 * tokens are dropped, and the rest is summarized.
 *
 * @param text       The text to check.
 * @param dictionary A built-in dictionary name, a word list (base forms
 *                   only), or a prebuilt Set.
 * @param options    Optional flags; `ignoreNumbers` defaults to true and
 *                   `ignoreProperNouns` to false.
 */
function checkCoverage(text, dictionary, options = {}) {
    const ignoreNumbers = options.ignoreNumbers ?? true;
    const ignoreProperNouns = options.ignoreProperNouns ?? false;
    const settings = { ignoreNumbers, ignoreProperNouns };
    const dict = resolveDictionary(dictionary);
    // getUserById -> get / User / By / Id
    const tokens = (0, extract_1.splitWords)(prepare(text)).flatMap(extract_1.splitCamelCase);
    const results = [];
    for (const token of tokens) {
        const result = classifyToken(token, dict, settings);
        if (result !== null) {
            results.push(result);
        }
    }
    return summarize(results);
}
|