h3-words 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
File without changes
@@ -0,0 +1,473 @@
1
+ /**
2
+ * Builds a small, curated wordlist (default 8192 words).
3
+ * Loads dictionaries, normalizes, filters, removes phonetic collisions.
4
+ *
5
+ * Run: npm run build:wordlist
6
+ * Output: scripts/words.json, scripts/checksum.txt
7
+ *
8
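+ * NOTE: this script only writes words.json and checksum.txt next to itself; the copy at ./words.json that the app imports is presumably produced by a separate step (confirm against the npm script).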
9
+ */
10
+ import fs from "node:fs";
11
+ import path from "node:path";
12
+ import crypto from "node:crypto";
13
+ import { fileURLToPath } from "node:url";
14
+ import wordlistEnglish from "wordlist-english";
15
+ import doubleMetaphone from "double-metaphone";
16
+ import * as badwords from "badwords-list";
17
+
18
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
19
+ const TARGET_SIZE = 8_192;
20
+
21
// Profanity / slur blacklist (we keep it simple: exact-word match after normalize()).
//
// `badwords-list` is a CommonJS package. When it is loaded through
// `import * as`, Node only exposes the CommonJS exports it can detect
// statically as named exports; the full `module.exports` object is always
// available as `default`. Probe both places so the package list is not
// silently dropped when `array` is only reachable via `default`.
const packageBadWords =
  [badwords.array, badwords.default?.array].find(Array.isArray) ?? [];

if (packageBadWords.length === 0) {
  // Best-effort: keep building, but make the missing blacklist visible.
  console.warn(
    "badwords-list exposed no word array; only project-specific bans will apply",
  );
}

// normalize() is a hoisted function declaration, so calling it here is safe.
// Entries that normalize to an empty string are discarded.
const banned = new Set(
  packageBadWords
    .map((w) => normalize(String(w)))
    .filter(Boolean),
);
27
// Project-specific additions to the blacklist. Every entry is passed through
// normalize() before it is added to `banned`, so entries should be plain
// lowercase a-z words. Lines marked "disabled" are entries that are currently
// commented out and therefore NOT banned.
const projectBannedWords = [
  // ── Core: violence, death, slurs, sexual, drugs, insults, misc ──
  "rape", "die", "cut", "gun",
  "war", "dead", "death", "dying",
  "corpse", "fatal", "harm", "hurt",
  "kill", "murder", "homicide", "suicide",
  "torture", "victim", "violence", "weapon",
  "blood", "wound", "stab", "shoot",
  "bomb", "attack", "assault", "strike",
  "battle", "warfare", "bullet",
  // disabled: "knife", "sword", "axe", "gut", "bullshit"
  "gash", "gore",
  "gory", "maim", "slay", "slit",
  "hostage", "kidnap", "rot",
  "handgun", "gunshot", "gunfire", "gunman",
  "gunboat", "gunpoint", "wartime", "warlike",
  "warpath", "warlord", "killing", "overkill",
  "bombing", "bombard", "shooting", "weaponry",
  "hurting", "hurtful", "harmful", "hateful",
  "horrify", "terrify", "abusive", "cruelty",
  "savagery", "suicidal", "sadistic", "penknife",
  "deaden", "shotgun", "gunmen", "starve",
  "starving", "beating", "behead", "thrash",
  "bludgeon", "strangle", "hoodlum", "assassin",
  "henchman", "redskin", "spastic", "niggling",
  "arsehole", "schmuck", "slave",
  "terror", "lynch", "hate", "abuse",
  "racism", "racist", "race", "fascist",
  "genocide", "cruel", "brutal", "savage",
  "sexual", "sexist", "obscene", "sordid",
  "nude", "orgy", "gay", "lesbian",
  "pervert", "prurient", "sadism",
  // disabled: "sexy", "toxic", "fear", "mad", "ill"
  "condom", "virgin", "lewd", "harlot",
  "pubic", "ravish", "seduce", "misogyny",
  "vaginae", "vaginal", "heroin", "addict",
  "poison", "venom", "sin", "grief", "panic",
  "horror", "devil", "curse",
  "wrath", "disease", "plague", "vomit",
  "grave", "enemy", "abort", "evil",
  "sinful", "hellish", "satanism", "warlock",
  "fat", "bad", "dumb", "idiot",
  "moron", "scum", "slag", "slur",
  "guilty", "stupid", "dummy", "idiotic",
  "lunatic", "madman", "crybaby", "madhouse",
  "comma", "gag",
  // disabled: "flu", "cry", "fail", "junk", "pain", "trash", "tear"
  "rash", "sick", "colon",
  "cough", "drown", "funeral", "ache",
  "choke", "bore", "bury", "dump",
  "err", "lie", "numb", "hang",
  "hanging", "hung", "vice", "child",
  "children", "affair", "extent", "regret",
  "practise", "strip", "ashcan", "beaten",
  "hating", "infant", "lethal", "abusing",
  "bashing", "degrade", "deprave", "despise",
  "lesbian", "teenage", "sixteen", "violate",
  "violent", "aardvark", "abortion", "eighteen",
  "fourteen", "fifteen", "teen", "nineteen",
  "teenage", "hitherto", "invading", "invasion",
  "maximise", "minimise", "optimise", "shouting",
  "bra", "colic", "creak", "nymph",
  "abduct", "cougar", "fetish", "morbid",
  "pelvic", "rectal", "sexing", "sexism",
  "asexual", "fascism", "jerking", "jinxing",
  "macabre", "adultery", "bisexual", "erection",
  "impotent", "flaccid", "colonist", "hooligan",
  "christen", "rabbi", "emir", "fatty",
  // ── Violence / Weapons / Fighting ──
  "bash", "beat", "blade", "blast",
  "brawl", "brute", "cage", "combat",
  "crush", "cull", "fang", "feud",
  "flog", "fury", "gang", "jail",
  "lash", "loot", "lurk", "maul",
  "prey", "punch", "purge", "rage",
  "raid", "rant", "riot", "ruin",
  "scald", "scalp", "slap", "slash",
  "slug", "smash", "snarl", "stalk",
  "swat", "thug", "thump", "whip",
  "ambush", "avenge", "bandit", "invade",
  "malice", "martyr", "mayhem", "napalm",
  "ransom", "ravage", "revolt", "threat",
  "afflict", "condemn", "convict", "crucify",
  "culprit", "dungeon", "firearm", "harming",
  "hostile", "inflict", "manhunt", "missile",
  "outrage", "revenge", "treason", "arsonist",
  "atrocity", "barbaric", "butchery", "cannibal",
  "casualty", "corporal", "criminal", "desolate",
  "despotic", "detonate", "dynamite", "frighten",
  "fugitive", "imprison", "predator", "punching",
  "ravaging", "sabotage", "savaging", "scalding",
  "shrapnel", "slashing", "smashing", "stalking",
  "stealing", "threaten", "throttle", "torching",
  "vendetta", "wrecking",
  // ── Death / Morbid / Horror ──
  "crypt", "decay", "dirge", "doom",
  "ghoul", "gloom", "pyre", "tomb",
  "agony", "demon", "dread", "fiend",
  "skull", "casket", "demise", "graven",
  "cremate", "despair", "inferno", "serpent",
  "sorcery", "tragedy", "turmoil", "vampire",
  "vulture", "autopsy", "devilish", "dreadful",
  "exorcism", "exorcist", "fatalism", "mournful",
  "skeleton", "werewolf",
  // ── Disgusting / Gross / Bodily ──
  "bile", "cyst", "dung", "muck",
  "ooze", "puke", "reek", "scab",
  "scar", "slob", "slop", "snot",
  "sore", "spew", "spit", "wart",
  "welt", "bowel", "enema", "fetid",
  "filth", "flesh", "phlegm", "snort",
  "spasm", "maggot", "pimple", "putrid",
  "rancid", "rotten", "sicken", "sludge",
  "spleen", "vermin", "filthy", "gangrene",
  "smallpox", "stinking", "virulent", "vomiting",
  // ── Sexual / Body ──
  "ogle", "teat", "womb", "bosom",
  "harem", "erotic", "fondle", "sultry",
  "indecent", "perverse",
  // ── Offensive / Derogatory / Insults ──
  "gob", "hag", "mob", "sod",
  "jerk", "mock", "rude", "slum",
  "vile", "beast", "bigot", "crazy",
  "crude", "nasty", "scoff", "scorn",
  "spite", "tramp", "asylum", "bedlam",
  "defame", "defile", "deform", "deject",
  "injure", "injury", "insult", "midget",
  "ordeal", "orphan", "racial", "rancor",
  "sadden", "stigma", "stolen", "stupor",
  "anguish", "bigotry", "damning", "derange",
  "disgust", "immoral", "leprosy", "mortify",
  "painful", "uncouth", "vagrant", "neurotic",
  "paranoia", "paranoid", "scornful", "spiteful",
  "vengeful",
  // ── Drugs / Alcohol / Vice ──
  "dope", "drug", "booze", "drunk",
  "opium", "tipsy", "torch", "toxin",
  "alcohol", "drunken", "hashish", "morphine",
  "syringe", "tobacco", "epidemic", "gambling",
  "narcotic",
  // ── Crime / derivatives ──
  "aborting", "burying", "drowning", "raging",
  "wail", "weep", "crime", "fraud",
  "felon", "guilt", "havoc", "creep",
  // ── Round 2: violence, death, disgusting, sexual, offensive, drugs ──
  "bang", "duel", "fist", "foul",
  "grim", "hurl", "raze", "rend",
  "slam", "stun", "suck", "arson",
  "ghost", "swear", "theft", "thief",
  "wreck", "devour", "famine", "hijack",
  "infect", "scream", "tragic", "vandal",
  "anthrax", "berserk", "brutish", "burglar",
  "dooming", "fearful", "larceny", "lashing",
  "lurking", "machete", "rampage", "rancour",
  "ransack", "ruining", "scourge", "amputate",
  "coercive", "crucifix", "decimate", "disfavor",
  "disgrace", "dishonor", "doomsday", "fatality",
  "fearsome", "fiendish", "gangland", "horrific",
  "injuring", "maltreat", "stampede", "coma",
  "mourn", "mortal", "effigy", "embalm",
  "epitaph", "obituary", "comatose", "stricken",
  "mange", "mangy", "louse", "leech",
  "groin", "scorpion", "toxicity", "lusty",
  "lurid", "grope", "moaning", "puberty",
  "sensual", "fondling", "foreskin", "seducing",
  "cur", "foe", "woe", "cult",
  "snob", "livid", "loony", "lousy",
  "mania", "manic", "noose", "obese",
  "scold", "amoral", "enmity", "extort",
  "horrid", "infamy", "misery", "offend",
  "harelip", "anorexia", "anorexic", "dementia",
  "derelict", "derision", "shameful", "druggist",
  "drunkard", "lobotomy", "overdose",
  // ── Round 3: triggering (sexual, violent, derogatory, disease) ──
  "moan", "rabid", "lice", "loin",
  "hump", "kink", "loon", "oaf",
  "gout", "yuck", "burp", "racy",
  "runt", "bawdy", "booty", "buxom",
  "dunce", "dusky", "elegy", "hick",
  "ninny", "pansy", "pokey", "pokie",
  "polio", "prude", "pudgy", "booby",
  "skulk", "waif", "peon", "retch",
  "saucy", "chaste", "climax", "idiocy",
  "inmate", "nudist", "pummel", "urchin",
  "hemlock", "heathen", "infidel", "jilting",
  "lisping", "matador", "megaton", "piggish",
  "prudish", "ranting", "rending", "servile",
  "smiting", "thieve", "doleful", "ogling",
  "snarling", "snorting", "humping", "spanking",
  "burglary", "diatribe", "emaciate", "henchmen",
  "insolent", "minstrel", "petulant", "polygamy",
  "swarthy", "vagabond", "welshing", "acrimony",
  // ── Culturally sensitive ──
  "wigwam", "tomahawk",
  // ── Obscure 3-letter ──
  "sic", "qua", "avo", "caw",
  "chi", "eke", "fen", "fez",
  "fro", "gnu", "hew", "lop",
  "lye", "mew", "nee", "ova",
  "pap", "roe", "sac", "sop",
  "sup", "wan", "wot", "zac",
  "gee", "coo",
  // ── Obscure 4-letter ──
  "afto", "ambo", "arvo", "bani",
  "baud", "bide", "bonz", "boor",
  "bray", "calk", "cark", "cede",
  "czar", "daub", "devo", "dour",
  "eave", "esky", "fogy", "furl",
  "gday", "geld", "gibe", "gild",
  "gilt", "glib", "goad", "gybe",
  "hale", "hark", "hart", "hind",
  "hock", "hoon", "hove", "jute",
  "lade", "lath", "laud", "lieu",
  "loam", "loll", "lope", "lute",
  "lyre", "mien", "mire", "morn",
  "nigh", "nong", "onya", "pall",
  "pate", "prow", "rort", "shod",
  "sire", "toga", "tsar", "twee",
  "wist", "woop", "yowl", "zack",
  // ── Obscure 5-letter ──
  "abhor", "acrid", "aftie", "allay",
  "annul", "aorta", "arbor", "ardor",
  "banal", "baulk", "bayou", "beady",
  "befit", "belie", "beset", "biffo",
  "bluey", "bonza", "bonze", "briny",
  "broil", "bundy", "burro", "butte",
  "byway", "calve", "chafe", "chaff",
  "chasm", "cheep", "chide", "chook",
  "circa", "clime", "cooee", "croon",
  "curio", "decry", "deify", "deign",
  "ditty", "doona", "dowdy", "downy",
  "dowry", "droll", "dunno", "dunny",
  "exalt", "extol", "exude", "exult",
  "fiche", "filch", "flout", "fogey",
  "foist", "fount", "frond", "galah",
  "garbo", "genii", "gimme", "glade",
  "gnarl", "gnash", "gonna", "goodo",
  "gruel", "guile", "gulch", "halon",
  "infix", "inlay", "junta", "kanga",
  "kylie", "leery", "lemme", "lupin",
  "lymph", "mamma", "miaow", "moult",
  "mousy", "natty", "newsy", "nulla",
  "octal", "oking", "olden", "passe",
  "piety", "pique", "pithy", "primp",
  "privy", "psalm", "quark", "radii",
  "ravel", "razoo", "rebut", "remit",
  "revue", "rouge", "rouse", "ruddy",
  "ruing", "sabre", "unman",
  // ── Obscure 6-letter ──
  "acumen", "aflame", "amidst", "aplomb",
  "arable", "ardent", "belfry", "bemoan",
  "bingle", "blithe", "bodice", "bovine",
  "bunyip", "bushel", "cavort", "chintz",
  "cloven", "cogent", "cortex", "damsel",
  "dazing", "desist", "dinkum", "disuse",
  "dorsal", "endive", "fervor", "foment",
  "geeing", "gentry", "girdle", "gutful",
  "haling", "hobnob", "hubbub", "hymnal",
  "jading", "jibing", "junket", "laming",
  "laxity", "lichen", "loping", "lustre",
  "madcap", "madmen", "meagre", "mohair",
  "myopic", "nicety", "optima", "paltry",
  "papacy", "papyri", "pedlar", "planar",
  "ramrod", "rankle", "rebuke", "recant",
  "regent", "repose", "revere", "rubric",
  "stanch", "stolid", "theist", "thence",
  "thresh", "thrice", "turgid",
  // ── Obscure 7-letter ──
  "askance", "attache", "belabor", "calculi",
  "caustic", "cistern", "coarsen", "cogency",
  "congeal", "corsage", "dualism", "dunging",
  "earbash", "hackney", "hymning", "insipid",
  "jocular", "jumbuck", "languid", "languor",
  "latrine", "lexical", "lexicon", "liturgy",
  "maudlin", "polemic", "rostrum", "sojourn",
  "stoical", "swagman", "tankard", "treacle",
  "vestige",
  // ── Obscure 8-letter ──
  "abstruse", "brackish", "cherubim", "claptrap",
  "conjugal", "consular", "dillybag", "doggerel",
  "esophagi", "filigree", "guttural", "indolent",
  "larrikin", "mandible", "monastic", "opaquing",
  "pastiche", "pedagogy", "primeval", "putative",
  "skerrick", "taciturn", "teletype", "thesauri",
  "ubiquity", "vehement", "venerate", "verbiage",
  "vestment", "yarmulke",
];

// Repeated entries are harmless because `banned` is a Set.
for (const entry of projectBannedWords) {
  banned.add(normalize(entry));
}
315
+
316
+ // ---------- helpers ----------
317
+
318
/**
 * Reduces a word to a bare lowercase a-z key.
 *
 * Steps, in order:
 *  1. NFKD-decompose, so accented and compatibility characters split into a
 *     base letter plus marks (e.g. "é" -> "e" + combining accent).
 *  2. Lowercase. This runs AFTER decomposition because some characters only
 *     turn into a capital ASCII letter once decomposed (e.g. U+2102 -> "C");
 *     lowercasing first would leave that capital to be stripped in step 3.
 *  3. Drop every character that is not a-z (marks, digits, punctuation,
 *     whitespace).
 *
 * @param {string} word - Raw word; must be a string.
 * @returns {string} The a-z-only form; may be empty.
 */
function normalize(word) {
  return word
    .normalize("NFKD")
    .toLowerCase()
    .replace(/[^a-z]/g, "");
}
324
+
325
/**
 * Decides whether an already-normalized word may enter the wordlist.
 * BIP39-style-ish: readable, avoid awkward patterns. The rules can be strict
 * because only 8192 words are needed.
 *
 * A word is rejected when any of these holds:
 *  - its length is outside 3..8 characters,
 *  - it is in the `banned` set,
 *  - it contains the same character three times in a row,
 *  - it contains one of the clusters "qzx", "xq" or "jj",
 *  - it ends in "s", "ed", "ly", "er" or "est".
 *
 * @param {string} word - Normalized (lowercase a-z) candidate.
 * @returns {boolean} true when the word passes every rule.
 */
function isValid(word) {
  const { length } = word;
  if (length < 3 || length > 8) return false;

  if (banned.has(word)) return false;

  const hasTripleLetter = /(.)\1\1/.test(word); // aaa
  const hasUglyCluster = /(qzx|xq|jj)/.test(word);
  if (hasTripleLetter || hasUglyCluster) return false;

  const rejectedSuffixes = ["s", "ed", "ly", "er", "est"];
  return !rejectedSuffixes.some((suffix) => word.endsWith(suffix));
}
339
+
340
+ // ---------- load raw sources (frequency-ordered: common first) ----------
341
+
342
// Lowest frequency tier seen so far for each raw dictionary entry.
const bestTierByRaw = new Map();
// Raw entries in first-seen order. Iteration is group-major (every tier of
// one group before the next group); popularity ordering is applied by the
// sorts further down, not here.
const orderedRaw = [];

// Popularity proxy: lower tier = more common.
// Tiers 10/20 are strongly preferred; 35/40 are only a fallback.
const frequencyTiers = [10, 20, 35, 40];
const COMMON_MAX_TIER = 20;
const WORD1_SIZE = 4096;

for (const group of [
  "english",
  "english/american",
  "english/british",
  "english/australian",
  "english/canadian",
]) {
  for (const freq of frequencyTiers) {
    const entries = wordlistEnglish[group + "/" + freq];
    // Not every group/tier combination has to exist in the package.
    if (entries == null) continue;

    for (const w of entries) {
      if (!bestTierByRaw.has(w)) {
        bestTierByRaw.set(w, freq);
        orderedRaw.push(w);
        continue;
      }
      if (freq < bestTierByRaw.get(w)) {
        bestTierByRaw.set(w, freq);
      }
    }
  }
}

console.log(`Raw words (frequency-ordered): ${orderedRaw.length}`);
373
+
374
+ // ---------- normalize + filter (preserve frequency order) ----------
375
+
376
// Several raw spellings can normalize to the same word; remember the lowest
// tier seen for each normalized form.
const bestTierByNorm = new Map();

for (const raw of orderedRaw) {
  const norm = normalize(raw);
  if (!norm || !isValid(norm)) continue;

  // 999 is the fallback tier for a raw word with no recorded tier.
  const tier = bestTierByRaw.get(raw) ?? 999;
  const known = bestTierByNorm.get(norm);
  if (known === undefined || tier < known) {
    bestTierByNorm.set(norm, tier);
  }
}

// Map iteration follows insertion order, so first-seen order is preserved.
let filtered = Array.from(bestTierByNorm, ([w, tier]) => ({ w, tier }));

console.log(`After normalization/filtering: ${filtered.length}`);
396
+
397
+ // ---------- phonetic collision removal (prefer one word per metaphone code) ----------
398
+
399
// Primary metaphone code -> the word that claimed it first.
const metaphoneMap = new Map();
// Candidates that survived collision removal, in sorted order.
const phoneticUnique = [];

// Sort in place by tier, then length, then alphabetically, so the first word
// to claim a metaphone code is the most common one.
filtered.sort((a, b) => {
  return (
    a.tier - b.tier ||
    a.w.length - b.w.length ||
    a.w.localeCompare(b.w)
  );
});

for (const candidate of filtered) {
  const [primary] = doubleMetaphone(candidate.w);
  // Skip words without a primary code and words whose code is already taken.
  if (!primary || metaphoneMap.has(primary)) continue;

  metaphoneMap.set(primary, candidate.w);
  phoneticUnique.push({ w: candidate.w, tier: candidate.tier });
}

console.log(`After phonetic filtering (unique): ${phoneticUnique.length}`);
415
+
416
// Start from the phonetically unique words. If that is fewer than
// TARGET_SIZE, top up with the remaining filtered words (phonetic collisions)
// in their sorted order until the target is reached.
const used = new Set();
for (const { w } of phoneticUnique) {
  used.add(w);
}

let final = phoneticUnique.slice();

for (const candidate of filtered) {
  if (final.length >= TARGET_SIZE) break;
  if (used.has(candidate.w)) continue;
  final.push({ w: candidate.w, tier: candidate.tier });
}

console.log(`After fill: ${final.length}`);

if (final.length < TARGET_SIZE) {
  throw new Error("Not enough words — relax filters or add sources");
}
433
+
434
// ---------- Ordering: popularity first, then short, then alpha ----------
// Early indices therefore skew toward common vocabulary.
final.sort((a, b) => {
  return (
    a.tier - b.tier ||
    a.w.length - b.w.length ||
    a.w.localeCompare(b.w)
  );
});

// ---------- truncate ----------

// Split the sorted list into common (tier <= COMMON_MAX_TIER) and fallback
// words, keeping the sorted order inside each group.
const common = [];
const fallback = [];
for (const entry of final) {
  if (entry.tier <= COMMON_MAX_TIER) {
    common.push(entry);
  } else if (entry.tier > COMMON_MAX_TIER) {
    fallback.push(entry);
  }
}

// The first WORD1_SIZE entries (used for word1 in encoding) must all be
// common words.
if (common.length < WORD1_SIZE) {
  throw new Error(
    `Not enough tier<=${COMMON_MAX_TIER} words for first ${WORD1_SIZE} entries. Got ${common.length}. Relax filters or add sources.`,
  );
}

// Use only common words when they can fill the whole list; otherwise append
// the fallback words after them.
const ordered = common.length < TARGET_SIZE ? [...common, ...fallback] : common;

const output = ordered.slice(0, TARGET_SIZE).map((entry) => entry.w);
459
+
460
+ // ---------- write ----------
461
+
462
// Both artifacts are written into the directory that contains this script.
const outDir = path.join(__dirname);
const jsonStr = JSON.stringify(output, null, 2);

// The checksum covers the words joined by "\n", not the pretty-printed JSON.
const checksumHex = crypto
  .createHash("sha256")
  .update(output.join("\n"))
  .digest("hex");

const artifacts = [
  ["words.json", jsonStr],
  ["checksum.txt", checksumHex],
];
for (const [fileName, contents] of artifacts) {
  fs.writeFileSync(path.join(outDir, fileName), contents);
}

console.log("✅ Done");
console.log(`Final size: ${output.length}`);
console.log(`Written to scripts/words.json`);
@@ -0,0 +1 @@
1
+ 969be24dcff1651fb48360a77d470021771b0f3f1ff93664d94fba8ad19d454f