@qvac/translation-nmtcpp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -0
- package/README.md +470 -0
- package/binding.js +1 -0
- package/index.d.ts +82 -0
- package/index.js +188 -0
- package/lib/error.js +65 -0
- package/marian.js +186 -0
- package/package.json +69 -0
- package/prebuilds/android-arm/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/android-arm64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/android-ia32/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/android-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare.exports +3622 -0
- package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare.exports +3731 -0
- package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare.exports +3603 -0
- package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare.exports +3603 -0
- package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare.exports +3720 -0
- package/prebuilds/linux-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare.exports +0 -0
- package/third-party/indic-processor-deps/indicnlp/INDIC_NLP_LICENCE +9 -0
- package/third-party/indic-processor-deps/indicnlp/index.js +11 -0
- package/third-party/indic-processor-deps/indicnlp/indic_detokenize.js +141 -0
- package/third-party/indic-processor-deps/indicnlp/indic_normalize.js +1213 -0
- package/third-party/indic-processor-deps/indicnlp/indic_tokenize.js +123 -0
- package/third-party/indic-processor-deps/indicnlp/langinfo.js +609 -0
- package/third-party/indic-processor-deps/indicnlp/sinhala_transliterator.js +197 -0
- package/third-party/indic-processor-deps/indicnlp/unicode_transliterator.js +120 -0
- package/third-party/indic-processor-deps/sacremoses/SACREMOSES_LICENCE +21 -0
- package/third-party/indic-processor-deps/sacremoses/cjk.js +202 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/README.txt +8 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.as +65 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.bn +65 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ca +75 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.cs +390 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.de +325 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.el +1568 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.en +123 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.es +118 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.et +138 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fi +138 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fr +153 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ga +48 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.gu +105 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hi +113 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hu +103 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.is +251 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.it +180 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.kn +70 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lt +698 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lv +100 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ml +67 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mni +65 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mr +113 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.nl +115 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.or +101 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pa +102 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pl +283 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pt +210 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ro +38 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ru +293 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sk +474 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sl +78 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sv +97 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ta +71 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.tdt +210 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.te +70 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.yue +53 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.zh +53 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJK.txt +23246 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJKSymbols.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Close_Punctuation.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Currency_Symbol.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Han.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul_Syllables.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hiragana.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum-unichars-au.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha-unichars-au.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsLower.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsN.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPf.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPi.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSc.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSo.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsUpper.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Katakana.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Line_Separator.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Lowercase_Letter.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Number.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Open_Punctuation.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Punctuation.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Separator.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Symbol.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Titlecase_Letter.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Uppercase_Letter.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/index.js +8 -0
- package/third-party/indic-processor-deps/sacremoses/indic.js +76 -0
- package/third-party/indic-processor-deps/sacremoses/normalizer.js +264 -0
- package/third-party/indic-processor-deps/sacremoses/pernuliprops.js +287 -0
- package/third-party/indic-processor-deps/sacremoses/tokenizer.js +1217 -0
- package/third-party/indic-processor.js +565 -0
|
@@ -0,0 +1,565 @@
|
|
|
1
|
+
const {
|
|
2
|
+
MosesDetokenizer,
|
|
3
|
+
MosesPunctNormalizer,
|
|
4
|
+
MosesTokenizer
|
|
5
|
+
} = require('./indic-processor-deps/sacremoses')
|
|
6
|
+
|
|
7
|
+
const {
|
|
8
|
+
UnicodeIndicTransliterator,
|
|
9
|
+
IndicNormalizerFactory,
|
|
10
|
+
IndicTokenize,
|
|
11
|
+
IndicDetokenize
|
|
12
|
+
} = require('./indic-processor-deps/indicnlp')
|
|
13
|
+
|
|
14
|
+
/**
 * JavaScript version of the IndicProcessor class.
 *
 * Prepares sentences for translation (punctuation normalisation, digit
 * normalisation, entity placeholders, tokenisation, optional transliteration
 * to Devanagari) and reverses those steps on the translated output.
 *
 * Placeholder maps are handed from preprocessing to postprocessing through a
 * FIFO queue: every sentence preprocessed in inference mode enqueues one map,
 * and every postprocessed sentence dequeues one. Sentences must therefore be
 * postprocessed in the same order in which they were preprocessed.
 */
class IndicProcessor {
  /**
   * Constructor for IndicProcessor. Initializes all lookup tables, regular
   * expressions and tokenizer / normalizer / transliterator dependencies.
   * @param {boolean} inference - Whether to use inference mode (default: true).
   *   In inference mode e-mails, URLs, numerals and #/@ tags are replaced by
   *   placeholders during preprocessing and restored during postprocessing.
   */
  constructor (inference = true) {
    this.inference = inference

    /// ///////////////////////////
    // FLORES -> ISO CODES
    /// ///////////////////////////
    this._floresCodes = {
      asm_Beng: 'as',
      awa_Deva: 'hi',
      ben_Beng: 'bn',
      bho_Deva: 'hi',
      brx_Deva: 'hi',
      doi_Deva: 'hi',
      eng_Latn: 'en',
      gom_Deva: 'kK',
      gon_Deva: 'hi',
      guj_Gujr: 'gu',
      hin_Deva: 'hi',
      hne_Deva: 'hi',
      kan_Knda: 'kn',
      kas_Arab: 'ur',
      kas_Deva: 'hi',
      kha_Latn: 'en',
      lus_Latn: 'en',
      mag_Deva: 'hi',
      mai_Deva: 'hi',
      mal_Mlym: 'ml',
      mar_Deva: 'mr',
      mni_Beng: 'bn',
      mni_Mtei: 'hi',
      npi_Deva: 'ne',
      ory_Orya: 'or',
      pan_Guru: 'pa',
      san_Deva: 'hi',
      sat_Olck: 'or',
      snd_Arab: 'ur',
      snd_Deva: 'hi',
      tam_Taml: 'ta',
      tel_Telu: 'te',
      urd_Arab: 'ur',
      unr_Deva: 'hi'
    }

    /// ///////////////////////////
    // INDIC DIGIT TRANSLATION
    /// ///////////////////////////
    // Index i holds every native-script code point that stands for digit i.
    // U+0C66 (Telugu zero) is included so that Telugu numbers are normalised
    // consistently with Telugu 1-9 (U+0C67..U+0C6F).
    const nativeDigitsByValue = [
      '\u09e6\u0ae6\u0ce6\u0966\u0660\uabf0\u0b66\u0a66\u1c50\u06f0\u0c66',
      '\u09e7\u0ae7\u0967\u0ce7\u06f1\uabf1\u0b67\u0a67\u1c51\u0c67',
      '\u09e8\u0ae8\u0968\u0ce8\u06f2\uabf2\u0b68\u0a68\u1c52\u0c68',
      '\u09e9\u0ae9\u0969\u0ce9\u06f3\uabf3\u0b69\u0a69\u1c53\u0c69',
      '\u09ea\u0aea\u096a\u0cea\u06f4\uabf4\u0b6a\u0a6a\u1c54\u0c6a',
      '\u09eb\u0aeb\u096b\u0ceb\u06f5\uabf5\u0b6b\u0a6b\u1c55\u0c6b',
      '\u09ec\u0aec\u096c\u0cec\u06f6\uabf6\u0b6c\u0a6c\u1c56\u0c6c',
      '\u09ed\u0aed\u096d\u0ced\u06f7\uabf7\u0b6d\u0a6d\u1c57\u0c6d',
      '\u09ee\u0aee\u096e\u0cee\u06f8\uabf8\u0b6e\u0a6e\u1c58\u0c6e',
      '\u09ef\u0aef\u096f\u0cef\u06f9\uabf9\u0b6f\u0a6f\u1c59\u0c6f'
    ]

    this._digitsTranslationMap = new Map()
    nativeDigitsByValue.forEach((variants, value) => {
      const ascii = String(value)
      for (const nativeDigit of variants) {
        this._digitsTranslationMap.set(nativeDigit, ascii)
      }
      // Also map ASCII '0'-'9' onto themselves
      this._digitsTranslationMap.set(ascii, ascii)
    })

    /// ///////////////////////////
    // PLACEHOLDER MAP QUEUE
    /// ///////////////////////////
    this._placeholderEntityMaps = []

    /// ///////////////////////////
    // Dependencies
    /// ///////////////////////////
    this._enTok = new MosesTokenizer('en')
    this._enNormalizer = new MosesPunctNormalizer('en')
    this._enDetok = new MosesDetokenizer('en')
    this._xliterator = UnicodeIndicTransliterator

    this._indicTokenize = IndicTokenize
    this._indicDetokenize = IndicDetokenize
    this._indicNormalizerFactory = IndicNormalizerFactory

    /// ///////////////////////////
    // Precompiled Patterns
    /// ///////////////////////////
    this._MULTISPACE_REGEX = /[ ]{2,}/g
    this._DIGIT_SPACE_PERCENT = /(\d) %/g
    this._DOUBLE_QUOT_PUNC = /"([,.]+)/g
    // Only a NO-BREAK SPACE between two digits is a thousands separator.
    // Matching an ordinary space here would rewrite "5 10" as "5.10".
    this._DIGIT_NBSP_DIGIT = /(\d)\u00A0(\d)/g
    this._END_BRACKET_SPACE_PUNC_REGEX = /\) ([.!:?;,])/g

    this._URL_PATTERN =
      /\b(?<![\w/.])(?:(?:https?|ftp):\/\/)?(?:(?:[\w-]+\.)+(?!\.))(?:[\w/\-?#&=%.]+)+(?!\.\w+)\b/g
    this._NUMERAL_PATTERN =
      /(~?\d+\.?\d*\s?%?\s?-?\s?~?\d+\.?\d*\s?%|~?\d+%|\d+[-/.,:']\d+[-/.,:'+]\d+(?:\.\d+)?|\d+[-/.:'+]\d+(?:\.\d+)?)/g
    this._EMAIL_PATTERN = /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}/g
    this._OTHER_PATTERN = /[A-Za-z0-9]*[#|@]\w+/g

    // Combined punctuation replacements, applied in order.
    // Invisible / look-alike characters are written as escapes on purpose:
    // U+2018/U+2019/U+201A single quotes, U+201C/U+201D double quotes,
    // U+2026 ellipsis, U+00A0 no-break space.
    this._PUNC_REPLACEMENTS = [
      [/\r/g, ''],
      [/\(\s*/g, '('],
      [/\s*\)/g, ')'],
      [/\s:\s?/g, ':'],
      [/\s;\s?/g, ';'],
      [/[`´\u2018\u201A\u2019']/g, "'"],
      [/[„\u201C\u201D"«»]/g, '"'],
      [/[–—]/g, '-'],
      [/\u2026/g, '...'],
      [/[ \u00A0]%/g, '%'],
      [/nº\u00A0/g, 'nº '],
      [/\u00A0ºC/g, ' ºC'],
      // The replacer receives the matched string; drop only the leading space.
      [/ [?!;]/g, (m) => m.trim()],
      [/,\u00A0/g, ', ']
    ]

    // Spellings of "ID" that may come back in place of the Latin "ID" inside a
    // placeholder; each is registered as an alternative placeholder key.
    this._INDIC_FAILURE_CASES = [
      'آی ڈی ',
      'ꯑꯥꯏꯗꯤ',
      'आईडी',
      'आई . डी . ',
      'आई . डी .',
      'आई. डी. ',
      'आई. डी.',
      'आय. डी. ',
      'आय. डी.',
      'आय . डी . ',
      'आय . डी .',
      'ऐटि',
      'آئی ڈی ',
      'ᱟᱭᱰᱤ ᱾',
      'आयडी',
      'ऐडि',
      'आइडि',
      'ᱟᱭᱰᱤ'
    ]
  }

  /**
   * Replace every occurrence of a literal substring.
   * Implemented with split/join so that neither argument is interpreted as a
   * regular expression or as a `$`-replacement pattern.
   * @private
   * @param {string} text - Text to search
   * @param {string} search - Literal substring to look for
   * @param {string} replacement - Literal replacement
   * @returns {string} - Text with all occurrences replaced
   */
  _replaceAllLiteral (text, search, replacement) {
    if (search === '') {
      return text
    }
    return text.split(search).join(replacement)
  }

  /**
   * Build every placeholder spelling registered for one serial number, in
   * registration order: the Latin "ID" variants first, then the variants for
   * each entry of `_INDIC_FAILURE_CASES`.
   * @private
   * @param {number} serialNo - Placeholder serial number
   * @returns {Array<string>} - Placeholder keys
   */
  _placeholderKeys (serialNo) {
    const n = serialNo
    const keys = [
      `<ID${n}>`,
      `< ID${n} >`,
      `[ID${n}]`,
      `[ ID${n} ]`,
      `[ID ${n}]`,
      `<ID${n}]`,
      `< ID${n}]`,
      `<ID${n} ]`
    ]

    for (const c of this._INDIC_FAILURE_CASES) {
      keys.push(
        `<${c}${n}>`,
        `< ${c}${n} >`,
        `< ${c} ${n} >`,
        `<${c} ${n}]`,
        `< ${c} ${n} ]`,
        `[${c}${n}]`,
        `[${c} ${n}]`,
        `[ ${c}${n} ]`,
        `[ ${c} ${n} ]`,
        `${c} ${n}`,
        `${c}${n}`
      )
    }
    return keys
  }

  /**
   * Apply punctuation replacements to text
   * @private
   * @param {string} text - Text to process
   * @param {Array} replacements - Array of [pattern, replacement] pairs,
   *   applied in order; replacement may be a string or a replacer function
   * @returns {string} - Processed text
   */
  _applyPuncReplacements (text, replacements) {
    return replacements.reduce(
      (acc, [pattern, replacement]) => acc.replace(pattern, replacement),
      text
    )
  }

  /**
   * Normalize punctuation in text
   * @private
   * @param {string} text - Text to normalize
   * @returns {string} - Normalized, trimmed text
   */
  _puncNorm (text) {
    // 1) Table-driven replacements
    const replaced = this._applyPuncReplacements(text, this._PUNC_REPLACEMENTS)

    // 2) Additional patterns
    return replaced
      .replace(this._MULTISPACE_REGEX, ' ')
      .replace(this._END_BRACKET_SPACE_PUNC_REGEX, ')$1')
      .replace(this._DIGIT_SPACE_PERCENT, '$1%')
      .replace(this._DOUBLE_QUOT_PUNC, '$1"')
      .replace(this._DIGIT_NBSP_DIGIT, '$1.$2')
      .trim()
  }

  /**
   * Wrap substrings with matched patterns in the text with placeholders.
   * E-mails, URLs, numerals and #/@ tags are replaced by `<IDn>` and the
   * mapping from every registered placeholder spelling back to the original
   * substring is pushed onto the placeholder queue.
   * @private
   * @param {string} text - Text to process
   * @returns {string} - Text with placeholders
   */
  _wrapWithPlaceholders (text) {
    const placeholderEntityMap = {}
    let serialNo = 1
    const patterns = [
      this._EMAIL_PATTERN,
      this._URL_PATTERN,
      this._NUMERAL_PATTERN,
      this._OTHER_PATTERN
    ]

    for (const pattern of patterns) {
      // String#match with a global regex returns every full match and does
      // not depend on the regex's lastIndex. De-duplicate, keeping order.
      const entities = new Set(text.match(pattern) || [])

      for (const entity of entities) {
        // Skip matches that are too short to be a real URL / numeral
        if (
          pattern === this._URL_PATTERN &&
          entity.replace(/\./g, '').length < 4
        ) {
          continue
        }
        if (
          pattern === this._NUMERAL_PATTERN &&
          entity.replace(/[\s.:]/g, '').length < 4
        ) {
          continue
        }

        for (const key of this._placeholderKeys(serialNo)) {
          placeholderEntityMap[key] = entity
        }

        // Replace every occurrence: entities are de-duplicated above, so a
        // repeated entity would otherwise stay unprotected after its first hit.
        text = this._replaceAllLiteral(text, entity, `<ID${serialNo}>`)
        serialNo += 1
      }
    }

    // Clean up any remaining placeholder artifacts
    text = text.replace(/\s+/g, ' ')
    text = this._replaceAllLiteral(text, '>/', '>')
    text = this._replaceAllLiteral(text, ']/', ']')

    this._placeholderEntityMaps.push(placeholderEntityMap)
    return text
  }

  /**
   * Normalize text by translating numerals and optionally wrapping placeholders
   * @private
   * @param {string} text - Text to normalize
   * @returns {string} - Normalized text
   */
  _normalize (text) {
    // Translate digits to Latin numerals
    const digits = this._digitsTranslationMap
    const normalizedText = Array.from(text, (ch) =>
      digits.has(ch) ? digits.get(ch) : ch
    ).join('')

    return this.inference
      ? this._wrapWithPlaceholders(normalizedText)
      : normalizedText
  }

  /**
   * Helper method: normalizes, tokenizes, optionally transliterates from iso_lang -> 'hi'
   * @private
   * @param {string} sentence - Input sentence
   * @param {Object} normalizer - Language normalizer (must expose `normalize`)
   * @param {string} isoLang - ISO language code
   * @param {boolean} transliterate - Whether to transliterate
   * @returns {string} - Processed text
   */
  _doIndicTokenizeAndTransliterate (
    sentence,
    normalizer,
    isoLang,
    transliterate
  ) {
    const normed = normalizer.normalize(sentence.trim())
    const joined = this._indicTokenize.trivialTokenize(normed, isoLang).join(' ')

    if (!transliterate) {
      return joined
    }

    const xlated = this._xliterator.transliterate(joined, isoLang, 'hi')
    // Re-attach every virama (U+094D) that tokenisation isolated with spaces
    return this._replaceAllLiteral(xlated, ' \u094d ', '\u094d')
  }

  /**
   * Preprocess a single sentence
   * @private
   * @param {string} sent - Input sentence
   * @param {string} srcLang - Source language code
   * @param {string} tgtLang - Target language code
   * @param {Object} normalizer - Language normalizer (unused on the English path)
   * @param {boolean} isTarget - Whether this is a target sentence
   * @returns {string} - Preprocessed sentence; source sentences are prefixed
   *   with "<srcLang> <tgtLang> "
   */
  _preprocess (sent, srcLang, tgtLang, normalizer, isTarget) {
    const isoLang = this._floresCodes[srcLang] || 'hi'
    const scriptPart = srcLang.split('_')[1]
    // These scripts are never transliterated to Devanagari
    const doTransliterate = !['Arab', 'Aran', 'Olck', 'Mtei', 'Latn'].includes(
      scriptPart
    )

    // 1) Punctuation normalization, 2) numerals & placeholders
    const prepared = this._normalize(this._puncNorm(sent))

    let processedSent
    if (isoLang === 'en') {
      // English path
      const eNorm = this._enNormalizer.normalize(prepared.trim())
      processedSent = this._enTok.tokenize(eNorm, false, false, false).join(' ')
    } else {
      // Indic path
      processedSent = this._doIndicTokenizeAndTransliterate(
        prepared,
        normalizer,
        isoLang,
        doTransliterate
      )
    }

    processedSent = processedSent.trim()
    return isTarget ? processedSent : `${srcLang} ${tgtLang} ${processedSent}`
  }

  /**
   * Postprocess a single sentence.
   * Dequeues the oldest pending placeholder map (if any) and uses it to
   * restore the entities that preprocessing replaced.
   * @private
   * @param {string|Array} sent - Input sentence or array with sentence
   * @param {string} lang - Language code
   * @returns {string} - Postprocessed sentence
   */
  _postprocess (sent, lang) {
    // Unwrap if sent is a tuple or list
    if (Array.isArray(sent)) {
      sent = sent[0]
    }

    // FIFO: each sentence consumes the map created for it during
    // preprocessing; `undefined` when nothing is pending.
    const placeholderEntityMap = this._placeholderEntityMaps.shift()
    const [langCode, scriptCode] = lang.split('_', 2)
    const isoLang = this._floresCodes[lang] || 'hi'

    // Fix for Perso-Arabic scripts
    if (['Arab', 'Aran'].includes(scriptCode)) {
      const arabicFixes = [
        [' ؟', '؟'],
        [' ۔', '۔'],
        [' ،', '،'],
        ['ٮ۪', 'ؠ']
      ]
      for (const [from, to] of arabicFixes) {
        sent = this._replaceAllLiteral(sent, from, to)
      }
    }

    // Oriya fix
    if (langCode === 'ory') {
      sent = this._replaceAllLiteral(sent, 'ଯ଼', 'ୟ')
    }

    // Restore placeholders
    if (placeholderEntityMap) {
      for (const [key, entity] of Object.entries(placeholderEntityMap)) {
        sent = this._replaceAllLiteral(sent, key, entity)
      }
    }

    // Detokenize
    if (lang === 'eng_Latn') {
      return this._enDetok.detokenize(sent.split(' '))
    }
    const xlated = this._xliterator.transliterate(sent, 'hi', isoLang)
    return this._indicDetokenize.trivialDetokenize(xlated, isoLang)
  }

  /**
   * Preprocess a batch of sentences (normalize, tokenize, transliterate)
   * @public
   * @param {Array<string>} batch - Array of sentences
   * @param {string} srcLang - Source language code
   * @param {string} tgtLang - Target language code (optional)
   * @param {boolean} isTarget - Whether these are target sentences
   * @returns {Array<string>} - Preprocessed sentences
   */
  preprocessBatch (
    batch,
    srcLang,
    tgtLang = 'hin_Deva',
    isTarget = false
  ) {
    const isoCode = this._floresCodes[srcLang] || 'hi'
    const normalizer =
      srcLang !== 'eng_Latn'
        ? this._indicNormalizerFactory.getNormalizer(isoCode)
        : null

    return batch.map((s) =>
      this._preprocess(s, srcLang, tgtLang, normalizer, isTarget)
    )
  }

  /**
   * Postprocess a batch of sentences.
   * Sentences must be in the same order as the batch passed to
   * `preprocessBatch`, because placeholder maps are consumed first-in,
   * first-out.
   * @public
   * @param {Array<string>} sents - Array of sentences
   * @param {string} lang - Language code
   * @returns {Array<string>} - Postprocessed sentences
   */
  postprocessBatch (sents, lang = 'hin_Deva') {
    return sents.map((s) => this._postprocess(s, lang))
  }
}
|
|
564
|
+
|
|
565
|
+
// CommonJS export: the module's only export is the IndicProcessor class.
module.exports = { IndicProcessor }
|