@ingglish/fallback 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,626 @@
1
+ // src/index.ts
2
+ import { getCustomPronunciation } from "@ingglish/dictionary";
3
+ import { wordToArpabetTraced, wordToPhonetic } from "@ingglish/g2p";
4
+ import { arpabetToFormat as arpabetToFormat5 } from "@ingglish/phonemes";
5
+
6
+ // src/acronyms.ts
7
+ import { arpabetToFormat } from "@ingglish/phonemes";
8
/**
 * Known initialisms (lowercase key) mapped to the words they stand for.
 *
 * In the code visible in this module only the KEYS drive behaviour:
 * KNOWN_INITIALISMS is built from them and isInitialism() tests membership.
 * The expansions document what each key abbreviates.
 *
 * Entries are kept in alphabetical key order. Every key must be lowercase and
 * at most MAX_INITIALISM_LENGTH characters long, otherwise isInitialism()
 * can never match it.
 */
var INITIALISM_EXPANSIONS = {
  ac: ["alternating", "current"],
  ad: ["anno", "domini"],
  afk: ["away", "from", "keyboard"],
  ai: ["artificial", "intelligence"],
  aka: ["also", "known", "as"],
  am: ["ante", "meridiem"],
  api: ["application", "programming", "interface"],
  asap: ["as", "soon", "as", "possible"],
  atm: ["automated", "teller", "machine"],
  aws: ["amazon", "web", "services"],
  b2b: ["business", "to", "business"],
  b2c: ["business", "to", "consumer"],
  bc: ["before", "christ"],
  bdd: ["behavior", "driven", "development"],
  brb: ["be", "right", "back"],
  btw: ["by", "the", "way"],
  byob: ["bring", "your", "own", "bottle"],
  cd: ["continuous", "deployment"],
  cdn: ["content", "delivery", "network"],
  ceo: ["chief", "executive", "officer"],
  cfo: ["chief", "financial", "officer"],
  ci: ["continuous", "integration"],
  cia: ["central", "intelligence", "agency"],
  cio: ["chief", "information", "officer"],
  cli: ["command", "line", "interface"],
  cmo: ["chief", "marketing", "officer"],
  cms: ["content", "management", "system"],
  cnn: ["convolutional", "neural", "network"],
  coo: ["chief", "operating", "officer"],
  cors: ["cross", "origin", "resource", "sharing"],
  cpr: ["cardiopulmonary", "resuscitation"],
  cpu: ["central", "processing", "unit"],
  crm: ["customer", "relationship", "management"],
  crud: ["create", "read", "update", "delete"],
  cso: ["chief", "security", "officer"],
  css: ["cascading", "style", "sheets"],
  csv: ["comma", "separated", "values"],
  cto: ["chief", "technology", "officer"],
  dc: ["direct", "current"],
  ddos: ["distributed", "denial", "of", "service"],
  dea: ["drug", "enforcement", "administration"],
  diy: ["do", "it", "yourself"],
  dj: ["disc", "jockey"],
  dmv: ["department", "of", "motor", "vehicles"],
  dna: ["deoxyribonucleic", "acid"],
  dns: ["domain", "name", "system"],
  eod: ["end", "of", "day"],
  epa: ["environmental", "protection", "agency"],
  er: ["emergency", "room"],
  erp: ["enterprise", "resource", "planning"],
  eta: ["estimated", "time", "of", "arrival"],
  etl: ["extract", "transform", "load"],
  eu: ["european", "union"],
  faq: ["frequently", "asked", "questions"],
  fbi: ["federal", "bureau", "of", "investigation"],
  fda: ["food", "and", "drug", "administration"],
  ftp: ["file", "transfer", "protocol"],
  fyi: ["for", "your", "information"],
  gan: ["generative", "adversarial", "network"],
  gcp: ["google", "cloud", "platform"],
  gif: ["graphics", "interchange", "format"],
  gps: ["global", "positioning", "system"],
  gpt: ["generative", "pre-trained", "transformer"],
  gpu: ["graphics", "processing", "unit"],
  gui: ["graphical", "user", "interface"],
  hdd: ["hard", "disk", "drive"],
  hiv: ["human", "immunodeficiency", "virus"],
  hr: ["human", "resources"],
  html: ["hypertext", "markup", "language"],
  http: ["hypertext", "transfer", "protocol"],
  https: ["hypertext", "transfer", "protocol", "secure"],
  iaas: ["infrastructure", "as", "a", "service"],
  icu: ["intensive", "care", "unit"],
  id: ["identification"],
  ide: ["integrated", "development", "environment"],
  idk: ["i", "don't", "know"],
  imo: ["in", "my", "opinion"],
  io: ["input", "output"],
  ip: ["internet", "protocol"],
  iq: ["intelligence", "quotient"],
  irs: ["internal", "revenue", "service"],
  isp: ["internet", "service", "provider"],
  it: ["information", "technology"],
  jpeg: ["joint", "photographic", "experts", "group"],
  jpg: ["joint", "photographic", "experts", "group"],
  json: ["javascript", "object", "notation"],
  jwt: ["json", "web", "token"],
  kpi: ["key", "performance", "indicator"],
  lan: ["local", "area", "network"],
  lcd: ["liquid", "crystal", "display"],
  led: ["light", "emitting", "diode"],
  llm: ["large", "language", "model"],
  mc: ["master", "of", "ceremonies"],
  mfa: ["multi", "factor", "authentication"],
  ml: ["machine", "learning"],
  mp3: ["moving", "picture", "experts", "group", "audio", "layer", "three"],
  mp4: ["moving", "picture", "experts", "group"],
  mph: ["miles", "per", "hour"],
  mri: ["magnetic", "resonance", "imaging"],
  mvp: ["minimum", "viable", "product"],
  nasa: ["national", "aeronautics", "and", "space", "administration"],
  nato: ["north", "atlantic", "treaty", "organization"],
  nda: ["non", "disclosure", "agreement"],
  nic: ["network", "interface", "card"],
  nlp: ["natural", "language", "processing"],
  nosql: ["not", "only", "sql"],
  nsa: ["national", "security", "agency"],
  nsfw: ["not", "safe", "for", "work"],
  nyc: ["new", "york", "city"],
  omg: ["oh", "my", "god"],
  ooo: ["out", "of", "office"],
  oop: ["object", "oriented", "programming"],
  orm: ["object", "relational", "mapping"],
  os: ["operating", "system"],
  otp: ["one", "time", "password"],
  paas: ["platform", "as", "a", "service"],
  pc: ["personal", "computer"],
  pdf: ["portable", "document", "format"],
  php: ["hypertext", "preprocessor"],
  pm: ["post", "meridiem"],
  png: ["portable", "network", "graphics"],
  pov: ["point", "of", "view"],
  pr: ["public", "relations"],
  pto: ["paid", "time", "off"],
  qa: ["quality", "assurance"],
  rag: ["retrieval", "augmented", "generation"],
  ram: ["random", "access", "memory"],
  rfp: ["request", "for", "proposal"],
  rip: ["rest", "in", "peace"],
  rna: ["ribonucleic", "acid"],
  rnn: ["recurrent", "neural", "network"],
  roi: ["return", "on", "investment"],
  rom: ["read", "only", "memory"],
  rpm: ["revolutions", "per", "minute"],
  rsvp: ["please", "respond"],
  rv: ["recreational", "vehicle"],
  saas: ["software", "as", "a", "service"],
  sdk: ["software", "development", "kit"],
  sftp: ["secure", "file", "transfer", "protocol"],
  sla: ["service", "level", "agreement"],
  sql: ["structured", "query", "language"],
  ssd: ["solid", "state", "drive"],
  ssh: ["secure", "shell"],
  ssl: ["secure", "sockets", "layer"],
  suv: ["sport", "utility", "vehicle"],
  svg: ["scalable", "vector", "graphics"],
  tba: ["to", "be", "announced"],
  tbd: ["to", "be", "determined"],
  tcp: ["transmission", "control", "protocol"],
  tdd: ["test", "driven", "development"],
  tldr: ["too", "long", "didn't", "read"],
  tls: ["transport", "layer", "security"],
  tv: ["television"],
  uat: ["user", "acceptance", "testing"],
  udp: ["user", "datagram", "protocol"],
  ui: ["user", "interface"],
  uk: ["united", "kingdom"],
  un: ["united", "nations"],
  uri: ["uniform", "resource", "identifier"],
  url: ["uniform", "resource", "locator"],
  us: ["united", "states"],
  usa: ["united", "states", "of", "america"],
  usb: ["universal", "serial", "bus"],
  uv: ["ultraviolet"],
  ux: ["user", "experience"],
  vp: ["vice", "president"],
  vpn: ["virtual", "private", "network"],
  wan: ["wide", "area", "network"],
  xml: ["extensible", "markup", "language"],
  xss: ["cross", "site", "scripting"]
};
195
/** Words longer than this many characters are never treated as initialisms. */
var MAX_INITIALISM_LENGTH = 5;
/**
 * ARPABET phonemes for the spoken name of each lowercase ASCII letter
 * ("a" through "z"), used to spell a word out letter by letter.
 *
 * Each row below is "<letter> <phoneme> <phoneme> ..."; the first token is the
 * key and the remaining tokens become that letter's phoneme array.
 */
var LETTER_PHONEMES = {};
for (const row of [
  "a EY1",
  "b B IY1",
  "c S IY1",
  "d D IY1",
  "e IY1",
  "f EH1 F",
  "g JH IY1",
  "h EY1 CH",
  "i AY1",
  "j JH EY1",
  "k K EY1",
  "l EH1 L",
  "m EH1 M",
  "n EH1 N",
  "o OW1",
  "p P IY1",
  "q K Y UW1",
  "r AA1 R",
  "s EH1 S",
  "t T IY1",
  "u Y UW1",
  "v V IY1",
  "w D AH1 B AH0 L Y UW0",
  "x EH1 K S",
  "y W AY1",
  "z Z IY1"
]) {
  const [letter, ...spoken] = row.split(" ");
  LETTER_PHONEMES[letter] = spoken;
}
224
/** Set of every (lowercase) key of INITIALISM_EXPANSIONS. */
var KNOWN_INITIALISMS = new Set(Object.keys(INITIALISM_EXPANSIONS));
/**
 * Reports whether `word` is one of the known initialisms.
 *
 * The comparison is case-insensitive. Words longer than
 * MAX_INITIALISM_LENGTH are rejected without consulting the set.
 *
 * @param word Candidate word.
 * @returns `true` when the lowercased word is in KNOWN_INITIALISMS.
 */
function isInitialism(word) {
  const shortEnough = word.length <= MAX_INITIALISM_LENGTH;
  return shortEnough && KNOWN_INITIALISMS.has(word.toLowerCase());
}
231
+ var INITIALISM_SUFFIXES = ["'s", "s"];
232
+ function parseInitialismWithSuffix(word) {
233
+ for (const suffix of INITIALISM_SUFFIXES) {
234
+ if (word.length > suffix.length && word.endsWith(suffix)) {
235
+ const base = word.slice(0, -suffix.length);
236
+ if (isInitialism(base)) {
237
+ return { base, suffix };
238
+ }
239
+ }
240
+ }
241
+ return null;
242
+ }
243
+ function translateAsAcronym(word, format = "ingglish") {
244
+ const arpabet = [];
245
+ for (const char of word.toLowerCase()) {
246
+ const letterArpabet = LETTER_PHONEMES[char];
247
+ if (letterArpabet !== void 0) {
248
+ arpabet.push(...letterArpabet);
249
+ }
250
+ }
251
+ return arpabetToFormat(arpabet, format);
252
+ }
253
+
254
+ // src/british.ts
255
+ import { lookupPronunciation } from "@ingglish/dictionary";
256
+ import { arpabetToFormat as arpabetToFormat2 } from "@ingglish/phonemes";
257
+ var BRITISH_TO_AMERICAN = [
258
+ // -isation → -ization (must come before -ise)
259
+ { pattern: /isation$/, replacement: "ization" },
260
+ // -ise → -ize (realise→realize, organise→organize)
261
+ { pattern: /ise$/, replacement: "ize" },
262
+ // -our → -or (colour→color, favour→favor)
263
+ { pattern: /our$/, replacement: "or" },
264
+ // -oured → -ored (coloured→colored, favoured→favored)
265
+ { pattern: /oured$/, replacement: "ored" },
266
+ // -ouring → -oring (colouring→coloring)
267
+ { pattern: /ouring$/, replacement: "oring" },
268
+ // -ourable → -orable (favourable→favorable)
269
+ { pattern: /ourable$/, replacement: "orable" },
270
+ // -re → -er (centre→center, theatre→theater)
271
+ // Only after consonants to avoid matching normal -re words
272
+ { pattern: /([a-z])re$/, replacement: "$1er" },
273
+ // -lled → -led (travelled→traveled, cancelled→canceled)
274
+ { pattern: /lled$/, replacement: "led" },
275
+ // -lling → -ling (travelling→traveling, cancelling→canceling)
276
+ { pattern: /lling$/, replacement: "ling" },
277
+ // -ller → -ler (traveller→traveler)
278
+ { pattern: /ller$/, replacement: "ler" },
279
+ // -ence → -ense (defence→defense, offence→offense)
280
+ { pattern: /ence$/, replacement: "ense" },
281
+ // -ogue → -og (catalogue→catalog, dialogue→dialog)
282
+ { pattern: /ogue$/, replacement: "og" },
283
+ // -ae- → -e- (anaesthetic→anesthetic, paediatric→pediatric)
284
+ { pattern: /ae/, replacement: "e" },
285
+ // -oe- → -e- (foetus→fetus, oestrogen→estrogen)
286
+ { pattern: /oe/, replacement: "e" },
287
+ // -ey → -y (curtsey→curtsy)
288
+ { pattern: /ey$/, replacement: "y" },
289
+ // grey → gray
290
+ { pattern: /grey/, replacement: "gray" }
291
+ ];
292
+ function matchBritish(word) {
293
+ const lower = word.toLowerCase();
294
+ for (const { pattern, replacement } of BRITISH_TO_AMERICAN) {
295
+ if (pattern.test(lower)) {
296
+ const american = lower.replace(pattern, replacement);
297
+ if (american !== lower) {
298
+ const phonemes = lookupPronunciation(american);
299
+ if (phonemes) {
300
+ return { american, phonemes };
301
+ }
302
+ }
303
+ }
304
+ }
305
+ return null;
306
+ }
307
+ function translateAsBritish(word, format = "ingglish") {
308
+ const match = matchBritish(word);
309
+ if (match === null) {
310
+ return null;
311
+ }
312
+ return arpabetToFormat2(match.phonemes, format);
313
+ }
314
+
315
+ // src/compounds.ts
316
+ import { lookupPronunciation as lookupPronunciation2, getWordFrequency } from "@ingglish/dictionary";
317
+ import {
318
+ arpabetToFormat as arpabetToFormat3,
319
+ getFormatJoinSeparator,
320
+ getFormatPreservesCase
321
+ } from "@ingglish/phonemes";
322
/**
 * Upper-cases the first UTF-16 code unit of a string and leaves the rest as is.
 *
 * @param str Input string.
 * @returns The capitalized string; the empty string is returned unchanged.
 */
function capitalize(str) {
  return str.length === 0 ? str : str.charAt(0).toUpperCase() + str.slice(1);
}
328
/**
 * Reports whether `char` is a cased character in its uppercase form.
 *
 * Caseless input (digits, punctuation, the empty string) yields `false`
 * because its upper- and lowercase forms are identical.
 *
 * @param char String to test (callers pass a single character).
 * @returns `true` only when the input equals its uppercase form and differs
 *   from its lowercase form.
 */
function isUpperCase(char) {
  const upper = char.toUpperCase();
  const lower = char.toLowerCase();
  return upper !== lower && char === upper;
}
331
+ var MIN_PART_LENGTH = 3;
332
+ var MIN_PART_FREQUENCY = 500;
333
+ var MAX_PART_LENGTH = 15;
334
+ function dpDecompose(word) {
335
+ const n = word.length;
336
+ const dp = Array.from({
337
+ length: n + 1
338
+ });
339
+ dp[0] = { parts: [], score: 0 };
340
+ for (let i = MIN_PART_LENGTH; i <= n; i++) {
341
+ for (let j = Math.max(0, i - MAX_PART_LENGTH); j <= i - MIN_PART_LENGTH; j++) {
342
+ if (j === 0 && i === n) {
343
+ continue;
344
+ }
345
+ const prev = dp[j];
346
+ if (prev === void 0) {
347
+ continue;
348
+ }
349
+ const chunk = word.slice(j, i);
350
+ const phonemes = lookupWord(chunk);
351
+ if (!phonemes) {
352
+ continue;
353
+ }
354
+ const freq = getWordFrequency(chunk);
355
+ if (freq === void 0 || freq < MIN_PART_FREQUENCY) {
356
+ continue;
357
+ }
358
+ const newScore = prev.score + freq;
359
+ const newParts = prev.parts.length + 1;
360
+ const current = dp[i];
361
+ if (current === void 0 || newParts < current.parts.length || newParts === current.parts.length && newScore > current.score) {
362
+ dp[i] = { parts: [...prev.parts, chunk], score: newScore };
363
+ }
364
+ }
365
+ }
366
+ const result = dp[n];
367
+ if (result === void 0 || result.parts.length < 2) {
368
+ return null;
369
+ }
370
+ return result.parts;
371
+ }
372
+ function translateAsCompound(word, format = "ingglish") {
373
+ const lowerWord = word.toLowerCase();
374
+ if (lowerWord.length < 6) {
375
+ return null;
376
+ }
377
+ const parts = dpDecompose(lowerWord);
378
+ if (!parts) {
379
+ return null;
380
+ }
381
+ const translations = [];
382
+ let pos = 0;
383
+ for (const part of parts) {
384
+ const phonemes = lookupWord(part);
385
+ if (!phonemes) {
386
+ return null;
387
+ }
388
+ let translated = arpabetToFormat3(phonemes, format);
389
+ if (getFormatPreservesCase(format)) {
390
+ const originalPart = word.slice(pos, pos + part.length);
391
+ if (originalPart.length > 0 && isUpperCase(originalPart[0])) {
392
+ translated = capitalize(translated);
393
+ }
394
+ }
395
+ translations.push(translated);
396
+ pos += part.length;
397
+ }
398
+ return translations.join(getFormatJoinSeparator(format));
399
+ }
400
+ function lookupWord(word) {
401
+ return lookupPronunciation2(word);
402
+ }
403
+
404
+ // src/stemming.ts
405
+ import { lookupPronunciation as lookupPronunciation3 } from "@ingglish/dictionary";
406
+ import { arpabetToFormat as arpabetToFormat4, stripStress } from "@ingglish/phonemes";
407
/** Stress-free ARPABET symbols treated as voiceless when choosing -ed / -s allomorphs. */
var VOICELESS = /* @__PURE__ */ new Set([
  "CH",
  "F",
  "HH",
  "K",
  "P",
  "S",
  "SH",
  "T",
  "TH"
]);
/** Stress-free ARPABET symbols treated as sibilants; a stem ending in one takes "IH0 Z" for -s / -es. */
var SIBILANTS = /* @__PURE__ */ new Set([
  "CH",
  "JH",
  "S",
  "SH",
  "Z",
  "ZH"
]);
409
+ function selectEdPhonemes(lastPhoneme) {
410
+ const base = stripStress(lastPhoneme);
411
+ if (base === "T" || base === "D") {
412
+ return ["IH0", "D"];
413
+ }
414
+ if (VOICELESS.has(base)) {
415
+ return ["T"];
416
+ }
417
+ return ["D"];
418
+ }
419
+ function selectSPhonemes(lastPhoneme) {
420
+ const base = stripStress(lastPhoneme);
421
+ if (SIBILANTS.has(base)) {
422
+ return ["IH0", "Z"];
423
+ }
424
+ if (VOICELESS.has(base)) {
425
+ return ["S"];
426
+ }
427
+ return ["Z"];
428
+ }
429
/**
 * Suffixes for which getStemVariants() also tries spelling-adjusted stems
 * (added "e", dropped final letter, doubled final letter).
 */
var INFLECTIONAL_SUFFIXES = /* @__PURE__ */ new Set(
  ["ed", "er", "es", "est", "ification", "ify", "ifying", "ing", "or", "s"]
);
441
/**
 * Ordered table of recognised suffixes and their ARPABET pronunciation.
 *
 * matchStemming() walks this list top to bottom, so order is significant.
 * A `null` pronunciation marks an allomorphic suffix whose phonemes are chosen
 * at match time from the stem's final phoneme (see resolveSuffixPhonemes).
 *
 * Each row below is [suffix, space-separated phonemes | null] and is expanded
 * into a `{ phonemes, suffix }` record.
 */
var SUFFIX_PHONEMES = [
  // Long suffixes first (-ification and -ifying ahead of the shorter -ify / -ing)
  ["ification", "IH0 F IH0 K EY1 SH AH0 N"],
  ["ifying", "IH0 F AY1 IH0 NG"],
  ["ify", "IH0 F AY1"],
  // Verb suffixes
  ["ing", "IH0 NG"],
  ["ed", null], // allomorph: T / D / IH0 D
  ["es", null], // allomorph: same choice as -s
  ["s", null], // allomorph: S / Z / IH0 Z
  // Noun suffixes
  ["tion", "SH AH0 N"],
  ["sion", "ZH AH0 N"],
  ["ness", "N AH0 S"],
  ["ment", "M AH0 N T"],
  ["ity", "IH0 T IY0"],
  ["er", "ER0"],
  ["or", "ER0"],
  ["ist", "IH0 S T"],
  ["ism", "IH0 Z AH0 M"],
  // Adjective suffixes
  ["ly", "L IY0"],
  ["ful", "F AH0 L"],
  ["less", "L AH0 S"],
  ["able", "AH0 B AH0 L"],
  ["ible", "AH0 B AH0 L"],
  ["ous", "AH0 S"],
  ["ive", "IH0 V"],
  ["al", "AH0 L"],
  ["ic", "IH0 K"],
  // Comparative / superlative
  ["est", "AH0 S T"],
  // Additional suffixes
  ["ally", "AH0 L IY0"],
  ["ology", "AA1 L AH0 JH IY0"],
  ["ize", "AY1 Z"],
  ["ise", "AY1 Z"]
].map(([suffix, spoken]) => ({
  phonemes: spoken === null ? null : spoken.split(" "),
  suffix
}));
482
/**
 * Ordered table of recognised prefixes and their ARPABET pronunciation.
 *
 * matchStemming() tries these in order after every suffix has failed. Each row
 * below is [prefix, space-separated phonemes] and is expanded into a
 * `{ phonemes, prefix }` record.
 */
var PREFIX_PHONEMES = [
  ["un", "AH0 N"],
  ["re", "R IY0"],
  ["pre", "P R IY0"],
  ["dis", "D IH0 S"],
  ["mis", "M IH0 S"],
  ["over", "OW1 V ER0"],
  ["under", "AH1 N D ER0"],
  ["out", "AW1 T"],
  ["anti", "AE1 N T IY0"],
  ["super", "S UW1 P ER0"]
].map(([prefix, spoken]) => ({ phonemes: spoken.split(" "), prefix }));
494
+ function matchStemming(word) {
495
+ const lowerWord = word.toLowerCase();
496
+ for (const { phonemes: suffixArpabet, suffix } of SUFFIX_PHONEMES) {
497
+ if (lowerWord.endsWith(suffix) && lowerWord.length > suffix.length + 2) {
498
+ const stem = lowerWord.slice(0, -suffix.length);
499
+ for (const variant of getStemVariants(stem, suffix)) {
500
+ const baseArpabet = lookupPronunciation3(variant);
501
+ if (baseArpabet) {
502
+ const resolvedSuffix = resolveSuffixPhonemes(suffix, suffixArpabet, baseArpabet);
503
+ return {
504
+ phonemes: [...baseArpabet, ...resolvedSuffix],
505
+ stem: variant,
506
+ suffix
507
+ };
508
+ }
509
+ }
510
+ }
511
+ }
512
+ for (const { phonemes: prefixArpabet, prefix } of PREFIX_PHONEMES) {
513
+ if (lowerWord.startsWith(prefix) && lowerWord.length > prefix.length + 2) {
514
+ const stem = lowerWord.slice(prefix.length);
515
+ const baseArpabet = lookupPronunciation3(stem);
516
+ if (baseArpabet) {
517
+ return {
518
+ phonemes: [...prefixArpabet, ...baseArpabet],
519
+ prefix,
520
+ stem
521
+ };
522
+ }
523
+ }
524
+ }
525
+ return null;
526
+ }
527
+ function translateWithStemming(word, format = "ingglish") {
528
+ const match = matchStemming(word);
529
+ if (match === null) {
530
+ return null;
531
+ }
532
+ return arpabetToFormat4(match.phonemes, format);
533
+ }
534
/**
 * Lists the spellings to look up for a stem left after removing `suffix`.
 *
 * The literal stem always comes first. Candidates are returned in the order
 * they should be tried and may contain duplicates.
 *
 * Uses charAt() rather than String.prototype.at(), which is ES2022 and is
 * missing on early Node 16 releases that the package's `engines` field
 * (`node >=16`) still admits.
 *
 * @param stem Word with the suffix already removed.
 * @param suffix The suffix that was removed.
 * @returns Candidate base spellings, most literal first.
 */
function getStemVariants(stem, suffix) {
  const variants = [stem];
  if (INFLECTIONAL_SUFFIXES.has(suffix)) {
    variants.push(
      // Restore a dropped silent "e": "hop" (from "hoping") -> "hope".
      stem + "e",
      // Undo consonant doubling: "runn" (from "running") -> "run".
      stem.length > 1 ? stem.slice(0, -1) : stem,
      // Stem with its final character doubled: "hop" -> "hopp".
      stem.length > 0 ? stem + stem.charAt(stem.length - 1) : stem
    );
  }
  if (stem.endsWith("i")) {
    // Undo y -> i: "happi" (from "happiness") -> "happy".
    variants.push(stem.slice(0, -1) + "y");
  }
  // Restore a dropped final "y": "glor" (from "glorify") -> "glory".
  variants.push(stem + "y");
  return variants;
}
552
+ function resolveSuffixPhonemes(suffix, suffixArpabet, baseArpabet) {
553
+ if (suffixArpabet !== null) {
554
+ return suffixArpabet;
555
+ }
556
+ const lastPhoneme = baseArpabet.at(-1);
557
+ if (suffix === "ed") {
558
+ return selectEdPhonemes(lastPhoneme);
559
+ }
560
+ return selectSPhonemes(lastPhoneme);
561
+ }
562
+
563
+ // src/index.ts
564
+ function diagnoseUnknown(word) {
565
+ if (/(.)\1\1/.test(word) || !/[aeiouy]/i.test(word)) {
566
+ return null;
567
+ }
568
+ const { strategy } = translateUnknownCore(word, "ingglish");
569
+ switch (strategy) {
570
+ case "british": {
571
+ const m = matchBritish(word);
572
+ return { americanSpelling: m.american, phonemes: m.phonemes, strategy: "british" };
573
+ }
574
+ case "compound": {
575
+ return { parts: dpDecompose(word.toLowerCase()), strategy: "compound" };
576
+ }
577
+ case "custom": {
578
+ return { phonemes: getCustomPronunciation(word), strategy: "custom" };
579
+ }
580
+ case "g2p": {
581
+ return { strategy: "g2p", trace: wordToArpabetTraced(word) };
582
+ }
583
+ case "initialism": {
584
+ return { strategy: "initialism" };
585
+ }
586
+ case "stemming": {
587
+ const m = matchStemming(word);
588
+ return { prefix: m.prefix, stem: m.stem, strategy: "stemming", suffix: m.suffix };
589
+ }
590
+ }
591
+ }
592
+ function translateUnknown(word, format = "ingglish") {
593
+ return translateUnknownCore(word, format).translated;
594
+ }
595
+ function translateUnknownCore(word, format) {
596
+ const customPhonemes = getCustomPronunciation(word);
597
+ if (customPhonemes !== void 0) {
598
+ return { strategy: "custom", translated: arpabetToFormat5(customPhonemes, format) };
599
+ }
600
+ if (isInitialism(word)) {
601
+ return { strategy: "initialism", translated: translateAsAcronym(word, format) };
602
+ }
603
+ const britishResult = translateAsBritish(word, format);
604
+ if (britishResult !== null && britishResult.length > 0) {
605
+ return { strategy: "british", translated: britishResult };
606
+ }
607
+ const compoundResult = translateAsCompound(word, format);
608
+ if (compoundResult !== null && compoundResult.length > 0) {
609
+ return { strategy: "compound", translated: compoundResult };
610
+ }
611
+ const stemmedResult = translateWithStemming(word, format);
612
+ if (stemmedResult !== null && stemmedResult.length > 0) {
613
+ return { strategy: "stemming", translated: stemmedResult };
614
+ }
615
+ return { strategy: "g2p", translated: wordToPhonetic(word, format) };
616
+ }
617
+ export {
618
+ KNOWN_INITIALISMS,
619
+ LETTER_PHONEMES,
620
+ diagnoseUnknown,
621
+ isInitialism,
622
+ matchBritish,
623
+ parseInitialismWithSuffix,
624
+ translateAsAcronym,
625
+ translateUnknown
626
+ };
package/package.json ADDED
@@ -0,0 +1,54 @@
1
+ {
2
+ "name": "@ingglish/fallback",
3
+ "version": "0.1.0",
4
+ "description": "Unknown word translation strategies for Ingglish (G2P, stemming, compounds, etc.)",
5
+ "type": "module",
6
+ "main": "./dist/index.js",
7
+ "module": "./dist/index.js",
8
+ "types": "./dist/index.d.ts",
9
+ "exports": {
10
+ ".": {
11
+ "source": "./src/index.ts",
12
+ "import": {
13
+ "types": "./dist/index.d.ts",
14
+ "default": "./dist/index.js"
15
+ },
16
+ "require": {
17
+ "types": "./dist/index.d.cts",
18
+ "default": "./dist/index.cjs"
19
+ }
20
+ }
21
+ },
22
+ "files": [
23
+ "dist"
24
+ ],
25
+ "sideEffects": false,
26
+ "engines": {
27
+ "node": ">=16"
28
+ },
29
+ "scripts": {
30
+ "build": "tsup",
31
+ "build:fast": "tsup src/index.ts --format esm",
32
+ "lint": "eslint --cache src",
33
+ "test": "vitest run --no-color",
34
+ "bench": "vitest bench --no-color",
35
+ "prepublishOnly": "npm run build"
36
+ },
37
+ "dependencies": {
38
+ "@ingglish/phonemes": "^0.1.0",
39
+ "@ingglish/ipa": "^0.1.0",
40
+ "@ingglish/dictionary": "^0.1.0",
41
+ "@ingglish/g2p": "^0.1.0"
42
+ },
43
+ "author": "Paul Tarjan",
44
+ "license": "MIT",
45
+ "repository": {
46
+ "type": "git",
47
+ "url": "git+https://github.com/ptarjan/ingglish.git",
48
+ "directory": "packages/fallback"
49
+ },
50
+ "homepage": "https://github.com/ptarjan/ingglish#readme",
51
+ "bugs": {
52
+ "url": "https://github.com/ptarjan/ingglish/issues"
53
+ }
54
+ }