@ingglish/fallback 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +654 -0
- package/dist/index.d.cts +119 -0
- package/dist/index.d.ts +119 -0
- package/dist/index.js +626 -0
- package/package.json +54 -0
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import { G2PTrace } from '@ingglish/g2p';
|
|
2
|
+
import { OutputFormat } from '@ingglish/phonemes';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Acronym and initialism handling.
|
|
6
|
+
*
|
|
7
|
+
* Handles words that should be spelled out letter-by-letter
|
|
8
|
+
* (e.g., URL -> "you-are-ell") vs acronyms pronounced as words
|
|
9
|
+
* (e.g., NASA -> "nasa").
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* Phonemes for individual letters (for spelling out acronyms).
|
|
14
|
+
* Based on how native English speakers pronounce alphabet letters.
|
|
15
|
+
*/
|
|
16
|
+
declare const LETTER_PHONEMES: Record<string, string[]>;
|
|
17
|
+
/**
|
|
18
|
+
* Known initialisms - derived from INITIALISM_EXPANSIONS for consistency.
|
|
19
|
+
* These are pronounced as individual letters, NOT as words.
|
|
20
|
+
*
|
|
21
|
+
* Excludes acronyms pronounced as words like:
|
|
22
|
+
* - RAM (ram), ROM (rom), GIF (gif/jif), JPEG (jay-peg)
|
|
23
|
+
* - JSON (jason), SQL (sequel), NASA, NATO, SCUBA, LASER
|
|
24
|
+
*/
|
|
25
|
+
declare const KNOWN_INITIALISMS: Set<string>;
|
|
26
|
+
/**
|
|
27
|
+
* Checks if a word should be spelled out as individual letters (initialism).
|
|
28
|
+
* Only returns true for known initialisms - unknown uppercase words are NOT
|
|
29
|
+
* automatically treated as initialisms since they might be acronyms pronounced
|
|
30
|
+
* as words (like NASA, GIF, etc).
|
|
31
|
+
*/
|
|
32
|
+
declare function isInitialism(word: string): boolean;
|
|
33
|
+
/**
|
|
34
|
+
* Checks if a word is an initialism with a suffix (e.g., "IDs", "TVs", "URLs", "API's").
|
|
35
|
+
* Returns the base initialism and suffix if matched, null otherwise.
|
|
36
|
+
*/
|
|
37
|
+
declare function parseInitialismWithSuffix(word: string): null | {
|
|
38
|
+
base: string;
|
|
39
|
+
suffix: string;
|
|
40
|
+
};
|
|
41
|
+
/**
|
|
42
|
+
* Translates a word by spelling out each letter.
|
|
43
|
+
* Used for initialisms (letter-by-letter words) like URL, HTML, API.
|
|
44
|
+
*/
|
|
45
|
+
declare function translateAsAcronym(word: string, format?: OutputFormat): string;
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* British-to-American spelling conversion.
|
|
49
|
+
*
|
|
50
|
+
* Converts common British English spellings to their American
|
|
51
|
+
* equivalents and looks them up in the CMU dictionary.
|
|
52
|
+
* Used as a fallback when the British spelling isn't found.
|
|
53
|
+
*/
|
|
54
|
+
|
|
55
|
+
interface BritishMatch {
|
|
56
|
+
american: string;
|
|
57
|
+
phonemes: string[];
|
|
58
|
+
}
|
|
59
|
+
/**
|
|
60
|
+
* Matches a word against British-to-American spelling rules.
|
|
61
|
+
* Returns the American spelling and its phonemes, or null if no match.
|
|
62
|
+
*/
|
|
63
|
+
declare function matchBritish(word: string): BritishMatch | null;
|
|
64
|
+
|
|
65
|
+
/**
|
|
66
|
+
* Fallback strategies for translating unknown words.
|
|
67
|
+
*
|
|
68
|
+
* Called when a word isn't in the dictionary. Each strategy ultimately
|
|
69
|
+
* produces ARPAbet `string[]` (the pipeline IR), which is then passed
|
|
70
|
+
* through `arpabetToFormat()` for final output.
|
|
71
|
+
*
|
|
72
|
+
* Strategy order:
|
|
73
|
+
* 1. Custom pronunciations (tech terms, brand names)
|
|
74
|
+
* 2. Initialisms (spell out letters: URL -> you-are-ell)
|
|
75
|
+
* 3. British spelling normalization (colour -> color)
|
|
76
|
+
* 4. Compound word splitting (github -> git + hub)
|
|
77
|
+
* 5. Stemming (find known base word + known suffix)
|
|
78
|
+
* 6. Rule-based grapheme-to-phoneme
|
|
79
|
+
*/
|
|
80
|
+
|
|
81
|
+
type WordDiagnosis = {
|
|
82
|
+
americanSpelling: string;
|
|
83
|
+
phonemes: string[];
|
|
84
|
+
strategy: 'british';
|
|
85
|
+
} | {
|
|
86
|
+
parts: string[];
|
|
87
|
+
strategy: 'compound';
|
|
88
|
+
} | {
|
|
89
|
+
phonemes: string[];
|
|
90
|
+
strategy: 'custom';
|
|
91
|
+
} | {
|
|
92
|
+
prefix?: string;
|
|
93
|
+
stem: string;
|
|
94
|
+
strategy: 'stemming';
|
|
95
|
+
suffix?: string;
|
|
96
|
+
} | {
|
|
97
|
+
strategy: 'g2p';
|
|
98
|
+
trace: G2PTrace;
|
|
99
|
+
} | {
|
|
100
|
+
strategy: 'initialism';
|
|
101
|
+
};
|
|
102
|
+
|
|
103
|
+
/**
|
|
104
|
+
* Diagnoses an unknown word by determining which fallback strategy handles it
|
|
105
|
+
* and collecting diagnostic data for display.
|
|
106
|
+
*
|
|
107
|
+
* Returns null for obvious non-words (3+ repeated chars, no vowels).
|
|
108
|
+
*/
|
|
109
|
+
declare function diagnoseUnknown(word: string): null | WordDiagnosis;
|
|
110
|
+
/**
|
|
111
|
+
* Attempts all strategies to translate an unknown word.
|
|
112
|
+
*
|
|
113
|
+
* @param word The unknown word
|
|
114
|
+
* @param format The output format
|
|
115
|
+
* @returns The translated word
|
|
116
|
+
*/
|
|
117
|
+
declare function translateUnknown(word: string, format?: OutputFormat): string;
|
|
118
|
+
|
|
119
|
+
export { KNOWN_INITIALISMS, LETTER_PHONEMES, type WordDiagnosis, diagnoseUnknown, isInitialism, matchBritish, parseInitialismWithSuffix, translateAsAcronym, translateUnknown };
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import { G2PTrace } from '@ingglish/g2p';
|
|
2
|
+
import { OutputFormat } from '@ingglish/phonemes';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Acronym and initialism handling.
|
|
6
|
+
*
|
|
7
|
+
* Handles words that should be spelled out letter-by-letter
|
|
8
|
+
* (e.g., URL -> "you-are-ell") vs acronyms pronounced as words
|
|
9
|
+
* (e.g., NASA -> "nasa").
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* Phonemes for individual letters (for spelling out acronyms).
|
|
14
|
+
* Based on how native English speakers pronounce alphabet letters.
|
|
15
|
+
*/
|
|
16
|
+
declare const LETTER_PHONEMES: Record<string, string[]>;
|
|
17
|
+
/**
|
|
18
|
+
* Known initialisms - derived from INITIALISM_EXPANSIONS for consistency.
|
|
19
|
+
* These are pronounced as individual letters, NOT as words.
|
|
20
|
+
*
|
|
21
|
+
* Excludes acronyms pronounced as words like:
|
|
22
|
+
* - RAM (ram), ROM (rom), GIF (gif/jif), JPEG (jay-peg)
|
|
23
|
+
* - JSON (jason), SQL (sequel), NASA, NATO, SCUBA, LASER
|
|
24
|
+
*/
|
|
25
|
+
declare const KNOWN_INITIALISMS: Set<string>;
|
|
26
|
+
/**
|
|
27
|
+
* Checks if a word should be spelled out as individual letters (initialism).
|
|
28
|
+
* Only returns true for known initialisms - unknown uppercase words are NOT
|
|
29
|
+
* automatically treated as initialisms since they might be acronyms pronounced
|
|
30
|
+
* as words (like NASA, GIF, etc).
|
|
31
|
+
*/
|
|
32
|
+
declare function isInitialism(word: string): boolean;
|
|
33
|
+
/**
|
|
34
|
+
* Checks if a word is an initialism with a suffix (e.g., "IDs", "TVs", "URLs", "API's").
|
|
35
|
+
* Returns the base initialism and suffix if matched, null otherwise.
|
|
36
|
+
*/
|
|
37
|
+
declare function parseInitialismWithSuffix(word: string): null | {
|
|
38
|
+
base: string;
|
|
39
|
+
suffix: string;
|
|
40
|
+
};
|
|
41
|
+
/**
|
|
42
|
+
* Translates a word by spelling out each letter.
|
|
43
|
+
* Used for initialisms (letter-by-letter words) like URL, HTML, API.
|
|
44
|
+
*/
|
|
45
|
+
declare function translateAsAcronym(word: string, format?: OutputFormat): string;
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* British-to-American spelling conversion.
|
|
49
|
+
*
|
|
50
|
+
* Converts common British English spellings to their American
|
|
51
|
+
* equivalents and looks them up in the CMU dictionary.
|
|
52
|
+
* Used as a fallback when the British spelling isn't found.
|
|
53
|
+
*/
|
|
54
|
+
|
|
55
|
+
interface BritishMatch {
|
|
56
|
+
american: string;
|
|
57
|
+
phonemes: string[];
|
|
58
|
+
}
|
|
59
|
+
/**
|
|
60
|
+
* Matches a word against British-to-American spelling rules.
|
|
61
|
+
* Returns the American spelling and its phonemes, or null if no match.
|
|
62
|
+
*/
|
|
63
|
+
declare function matchBritish(word: string): BritishMatch | null;
|
|
64
|
+
|
|
65
|
+
/**
|
|
66
|
+
* Fallback strategies for translating unknown words.
|
|
67
|
+
*
|
|
68
|
+
* Called when a word isn't in the dictionary. Each strategy ultimately
|
|
69
|
+
* produces ARPAbet `string[]` (the pipeline IR), which is then passed
|
|
70
|
+
* through `arpabetToFormat()` for final output.
|
|
71
|
+
*
|
|
72
|
+
* Strategy order:
|
|
73
|
+
* 1. Custom pronunciations (tech terms, brand names)
|
|
74
|
+
* 2. Initialisms (spell out letters: URL -> you-are-ell)
|
|
75
|
+
* 3. British spelling normalization (colour -> color)
|
|
76
|
+
* 4. Compound word splitting (github -> git + hub)
|
|
77
|
+
* 5. Stemming (find known base word + known suffix)
|
|
78
|
+
* 6. Rule-based grapheme-to-phoneme
|
|
79
|
+
*/
|
|
80
|
+
|
|
81
|
+
type WordDiagnosis = {
|
|
82
|
+
americanSpelling: string;
|
|
83
|
+
phonemes: string[];
|
|
84
|
+
strategy: 'british';
|
|
85
|
+
} | {
|
|
86
|
+
parts: string[];
|
|
87
|
+
strategy: 'compound';
|
|
88
|
+
} | {
|
|
89
|
+
phonemes: string[];
|
|
90
|
+
strategy: 'custom';
|
|
91
|
+
} | {
|
|
92
|
+
prefix?: string;
|
|
93
|
+
stem: string;
|
|
94
|
+
strategy: 'stemming';
|
|
95
|
+
suffix?: string;
|
|
96
|
+
} | {
|
|
97
|
+
strategy: 'g2p';
|
|
98
|
+
trace: G2PTrace;
|
|
99
|
+
} | {
|
|
100
|
+
strategy: 'initialism';
|
|
101
|
+
};
|
|
102
|
+
|
|
103
|
+
/**
|
|
104
|
+
* Diagnoses an unknown word by determining which fallback strategy handles it
|
|
105
|
+
* and collecting diagnostic data for display.
|
|
106
|
+
*
|
|
107
|
+
* Returns null for obvious non-words (3+ repeated chars, no vowels).
|
|
108
|
+
*/
|
|
109
|
+
declare function diagnoseUnknown(word: string): null | WordDiagnosis;
|
|
110
|
+
/**
|
|
111
|
+
* Attempts all strategies to translate an unknown word.
|
|
112
|
+
*
|
|
113
|
+
* @param word The unknown word
|
|
114
|
+
* @param format The output format
|
|
115
|
+
* @returns The translated word
|
|
116
|
+
*/
|
|
117
|
+
declare function translateUnknown(word: string, format?: OutputFormat): string;
|
|
118
|
+
|
|
119
|
+
export { KNOWN_INITIALISMS, LETTER_PHONEMES, type WordDiagnosis, diagnoseUnknown, isInitialism, matchBritish, parseInitialismWithSuffix, translateAsAcronym, translateUnknown };
|