@qvac/translation-nmtcpp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. package/LICENSE +202 -0
  2. package/README.md +470 -0
  3. package/binding.js +1 -0
  4. package/index.d.ts +82 -0
  5. package/index.js +188 -0
  6. package/lib/error.js +65 -0
  7. package/marian.js +186 -0
  8. package/package.json +69 -0
  9. package/prebuilds/android-arm/qvac__translation-nmtcpp.bare +0 -0
  10. package/prebuilds/android-arm64/qvac__translation-nmtcpp.bare +0 -0
  11. package/prebuilds/android-ia32/qvac__translation-nmtcpp.bare +0 -0
  12. package/prebuilds/android-x64/qvac__translation-nmtcpp.bare +0 -0
  13. package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare +0 -0
  14. package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare.exports +3622 -0
  15. package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare +0 -0
  16. package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare.exports +3731 -0
  17. package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare +0 -0
  18. package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare.exports +3603 -0
  19. package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare +0 -0
  20. package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare.exports +3603 -0
  21. package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare +0 -0
  22. package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare.exports +3720 -0
  23. package/prebuilds/linux-x64/qvac__translation-nmtcpp.bare +0 -0
  24. package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare +0 -0
  25. package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare.exports +0 -0
  26. package/third-party/indic-processor-deps/indicnlp/INDIC_NLP_LICENCE +9 -0
  27. package/third-party/indic-processor-deps/indicnlp/index.js +11 -0
  28. package/third-party/indic-processor-deps/indicnlp/indic_detokenize.js +141 -0
  29. package/third-party/indic-processor-deps/indicnlp/indic_normalize.js +1213 -0
  30. package/third-party/indic-processor-deps/indicnlp/indic_tokenize.js +123 -0
  31. package/third-party/indic-processor-deps/indicnlp/langinfo.js +609 -0
  32. package/third-party/indic-processor-deps/indicnlp/sinhala_transliterator.js +197 -0
  33. package/third-party/indic-processor-deps/indicnlp/unicode_transliterator.js +120 -0
  34. package/third-party/indic-processor-deps/sacremoses/SACREMOSES_LICENCE +21 -0
  35. package/third-party/indic-processor-deps/sacremoses/cjk.js +202 -0
  36. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/README.txt +8 -0
  37. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.as +65 -0
  38. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.bn +65 -0
  39. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ca +75 -0
  40. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.cs +390 -0
  41. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.de +325 -0
  42. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.el +1568 -0
  43. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.en +123 -0
  44. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.es +118 -0
  45. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.et +138 -0
  46. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fi +138 -0
  47. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fr +153 -0
  48. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ga +48 -0
  49. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.gu +105 -0
  50. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hi +113 -0
  51. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hu +103 -0
  52. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.is +251 -0
  53. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.it +180 -0
  54. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.kn +70 -0
  55. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lt +698 -0
  56. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lv +100 -0
  57. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ml +67 -0
  58. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mni +65 -0
  59. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mr +113 -0
  60. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.nl +115 -0
  61. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.or +101 -0
  62. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pa +102 -0
  63. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pl +283 -0
  64. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pt +210 -0
  65. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ro +38 -0
  66. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ru +293 -0
  67. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sk +474 -0
  68. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sl +78 -0
  69. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sv +97 -0
  70. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ta +71 -0
  71. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.tdt +210 -0
  72. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.te +70 -0
  73. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.yue +53 -0
  74. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.zh +53 -0
  75. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJK.txt +23246 -0
  76. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJKSymbols.txt +1 -0
  77. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Close_Punctuation.txt +1 -0
  78. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Currency_Symbol.txt +1 -0
  79. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Han.txt +1 -0
  80. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul.txt +1 -0
  81. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul_Syllables.txt +1 -0
  82. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hiragana.txt +1 -0
  83. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum-unichars-au.txt +1 -0
  84. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum.txt +1 -0
  85. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha-unichars-au.txt +1 -0
  86. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha.txt +1 -0
  87. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsLower.txt +1 -0
  88. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsN.txt +1 -0
  89. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPf.txt +1 -0
  90. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPi.txt +1 -0
  91. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSc.txt +1 -0
  92. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSo.txt +1 -0
  93. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsUpper.txt +1 -0
  94. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Katakana.txt +1 -0
  95. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Line_Separator.txt +1 -0
  96. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Lowercase_Letter.txt +1 -0
  97. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Number.txt +1 -0
  98. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Open_Punctuation.txt +1 -0
  99. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Punctuation.txt +1 -0
  100. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Separator.txt +1 -0
  101. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Symbol.txt +1 -0
  102. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Titlecase_Letter.txt +1 -0
  103. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Uppercase_Letter.txt +1 -0
  104. package/third-party/indic-processor-deps/sacremoses/index.js +8 -0
  105. package/third-party/indic-processor-deps/sacremoses/indic.js +76 -0
  106. package/third-party/indic-processor-deps/sacremoses/normalizer.js +264 -0
  107. package/third-party/indic-processor-deps/sacremoses/pernuliprops.js +287 -0
  108. package/third-party/indic-processor-deps/sacremoses/tokenizer.js +1217 -0
  109. package/third-party/indic-processor.js +565 -0
@@ -0,0 +1,1213 @@
1
+ /**
2
+ *
3
+ * Copyright (c) 2013-present, Anoop Kunchukuttan
4
+ * All rights reserved.
5
+ *
6
+ * This source code is licensed under the MIT license found in the
7
+ * INDIC_NLP_LICENCE file in the indicnlp directory of this source tree.
8
+ *
9
+ * This code is a ported version of the sacremoses library. Please refer to NOTICE
10
+ * file in the root directory of this source tree.
11
+ */
12
+
13
+ const langinfo = require('./langinfo')
14
+
15
/**
 * Common root of the script normalizers.
 *
 * The normalizer classes do the following:
 * * Some characters have multiple Unicode codepoints. The normalizer chooses a single standard representation
 * * Some control characters are deleted
 * * While typing using the Latin keyboard, certain typical mistakes occur which are corrected by the module
 *
 * This class holds the invisible/formatting character constants shared by all
 * normalizers and the punctuation normalization helper. `normalize()` is a
 * placeholder: script specific normalizers derive from this class and override it.
 */
class NormalizerI {
  static BYTE_ORDER_MARK = '\uFEFF'
  static BYTE_ORDER_MARK_2 = '\uFFFE'
  static WORD_JOINER = '\u2060'
  static SOFT_HYPHEN = '\u00AD'

  static ZERO_WIDTH_SPACE = '\u200B'
  static NO_BREAK_SPACE = '\u00A0'

  static ZERO_WIDTH_NON_JOINER = '\u200C'
  static ZERO_WIDTH_JOINER = '\u200D'

  /**
   * Normalize punctuations.
   * Applies many of the punctuation normalizations that are part of MosesNormalizer
   * from sacremoses: typographic quotes, dashes and the ellipsis are mapped to
   * their plain ASCII spellings and every byte order mark is dropped.
   *
   * @param {string} text - Text to normalize
   * @returns {string} Text with punctuation normalized
   */
  _normalizePunctuations (text) {
    // Applied strictly in this order: the `''` rule relies on the single-quote
    // variants having already been mapped to an ASCII apostrophe.
    // Patterns are written as escapes so the intended code point is unambiguous.
    const substitutions = [
      [/\u201E/g, '"'], // double low-9 quotation mark
      [/\u201C/g, '"'], // left double quotation mark
      [/\u201D/g, '"'], // right double quotation mark
      [/\u2013/g, '-'], // en dash
      [/\u2014/g, ' - '], // em dash
      [/\u00B4/g, "'"], // acute accent
      [/\u2018/g, "'"], // left single quotation mark
      [/\u201A/g, "'"], // single low-9 quotation mark
      [/\u2019/g, "'"], // right single quotation mark
      [/''/g, '"'], // two apostrophes -> one double quote
      // Cannot match any more (acute accents were replaced above); kept so the
      // rule list stays complete.
      [/\u00B4\u00B4/g, '"'],
      [/\u2026/g, '...'] // horizontal ellipsis
    ]

    // replaceAll: a string pattern passed to replace() only removes the first match
    let result = text.replaceAll(NormalizerI.BYTE_ORDER_MARK, '')
    for (const [pattern, replacement] of substitutions) {
      result = result.replace(pattern, replacement)
    }

    return result
  }

  /**
   * Placeholder to be implemented by subclasses; this base version does
   * nothing and returns undefined.
   *
   * @param {string} text - Text to normalize
   */
  normalize (text) {
    // Method to be implemented by subclasses
  }
}
67
+
68
/**
 * Base normalizer for Indic scripts.
 *
 * Performs the normalization common to all scripts:
 * * Byte order mark, word joiner and soft hyphen removal
 * * ZERO_WIDTH_NON_JOINER and ZERO_WIDTH_JOINER removal
 * * ZERO_WIDTH_SPACE and NO_BREAK_SPACE replaced by spaces
 * * punctuation normalization
 * * optional chandra, nasal and vowel-ending normalization
 *
 * Script specific normalizers derive from this class, override normalize()
 * and call super.normalize() to avail of the common normalization.
 */
class BaseNormalizer extends NormalizerI {
  /**
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters ('do_nothing', 'to_anusvaara_strict', 'to_anusvaara_relaxed', 'to_nasal_consonants')
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   */
  constructor (
    lang = 'hi',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false
  ) {
    super()
    this.lang = lang
    this.removeNuktas = removeNuktas
    this.nasalsMode = nasalsMode
    this.doNormalizeChandras = doNormalizeChandras
    this.doNormalizeVowelEnding = doNormalizeVowelEnding

    // Pre-compute the language specific tables used by normalize()
    this._initNormalizeChandras()
    this._initNormalizeNasals()
    this._initNormalizeVowelEnding()
  }

  /**
   * Select the per-word vowel-ending function according to the language
   * family of `this.lang`; languages in neither list get the identity.
   */
  _initNormalizeVowelEnding () {
    if (langinfo.IE_LANGUAGES.includes(this.lang)) {
      this.fnVowelEnding = this._normalizeWordVowelEndingIe
    } else if (langinfo.DRAVIDIAN_LANGUAGES.includes(this.lang)) {
      this.fnVowelEnding = this._normalizeWordVowelEndingDravidian
    } else {
      this.fnVowelEnding = (x) => x
    }
  }

  /**
   * Build `this.chandraSubstitutions`: [from, to] character pairs obtained by
   * mapping script-relative offsets to characters of `this.lang`.
   */
  _initNormalizeChandras () {
    const substitutionOffsets = [
      [0x0d, 0x0f], // chandra e, independent
      [0x11, 0x13], // chandra o, independent
      [0x45, 0x47], // chandra e, dependent
      [0x49, 0x4b], // chandra o, dependent
      // [0x72, 0x0f], // mr: chandra e, independent

      [0x00, 0x02], // chandrabindu
      [0x01, 0x02] // chandrabindu
    ]

    this.chandraSubstitutions = substitutionOffsets.map(([from, to]) => [
      langinfo.offsetToChar(from, this.lang),
      langinfo.offsetToChar(to, this.lang)
    ])
  }

  /**
   * Apply every chandra substitution to all occurrences in the text.
   * @param {string} text
   * @returns {string}
   */
  _normalizeChandras (text) {
    for (const [match, repl] of this.chandraSubstitutions) {
      text = text.replace(new RegExp(match, 'g'), repl)
    }
    return text
  }

  /**
   * Prepare `this.patsRepls = [patterns, replacement]` for the
   * 'to_anusvaara_strict' mode. Each signature is
   * [nasal offset, first consonant offset, last consonant offset]; the pattern
   * matches nasal + halant followed by a consonant in that range, and the
   * replacement is anusvara followed by the captured consonant.
   */
  _initToAnusvaaraStrict () {
    const patSignatures = [
      [0x19, 0x15, 0x18],
      [0x1e, 0x1a, 0x1d],
      [0x23, 0x1f, 0x22],
      [0x28, 0x24, 0x27],
      [0x29, 0x24, 0x27],
      [0x2e, 0x2a, 0x2d]
    ]

    const toChar = (offset) => langinfo.offsetToChar(offset, this.lang)
    const halant = toChar(0x4d)
    const anusvara = toChar(0x02)

    const pats = patSignatures.map(
      ([nasal, first, last]) =>
        new RegExp(`${toChar(nasal)}${halant}([${toChar(first)}-${toChar(last)}])`, 'g')
    )

    this.patsRepls = [pats, `${anusvara}$1`]
  }

  /**
   * Apply the patterns prepared by _initToAnusvaaraStrict().
   * @param {string} text
   * @returns {string}
   */
  _toAnusvaaraStrict (text) {
    const [pats, replString] = this.patsRepls
    for (const pat of pats) {
      text = text.replace(pat, replString)
    }
    return text
  }

  /**
   * Prepare `this.patsRepls = [pattern, replacement]` for the
   * 'to_anusvaara_relaxed' mode: any listed nasal followed by halant is
   * replaced by anusvara, whatever follows it.
   */
  _initToAnusvaaraRelaxed () {
    const toChar = (offset) => langinfo.offsetToChar(offset, this.lang)

    const nasalsListStr = [0x19, 0x1e, 0x23, 0x28, 0x29, 0x2e].map(toChar).join('')
    const halant = toChar(0x4d)
    const anusvara = toChar(0x02)

    this.patsRepls = [new RegExp(`[${nasalsListStr}]${halant}`, 'g'), anusvara]
  }

  /**
   * Apply the pattern prepared by _initToAnusvaaraRelaxed().
   * @param {string} text
   * @returns {string}
   */
  _toAnusvaaraRelaxed (text) {
    const [pat, replString] = this.patsRepls
    return text.replace(pat, replString)
  }

  /**
   * Prepare `this.patsRepls` as a list of [pattern, replacement] pairs for the
   * 'to_nasal_consonants' mode (the inverse of the strict anusvara mode):
   * anusvara followed by a consonant in the signature's range becomes
   * nasal + halant + that consonant.
   */
  _initToNasalConsonants () {
    const patSignatures = [
      [0x19, 0x15, 0x18],
      [0x1e, 0x1a, 0x1d],
      [0x23, 0x1f, 0x22],
      [0x28, 0x24, 0x27],
      [0x29, 0x24, 0x27],
      [0x2e, 0x2a, 0x2d]
    ]

    const toChar = (offset) => langinfo.offsetToChar(offset, this.lang)
    const halant = toChar(0x4d)
    const anusvara = toChar(0x02)

    this.patsRepls = patSignatures.map(([nasal, first, last]) => [
      new RegExp(`${anusvara}([${toChar(first)}-${toChar(last)}])`, 'g'),
      `${toChar(nasal)}${halant}$1`
    ])
  }

  /**
   * Apply the pairs prepared by _initToNasalConsonants().
   * @param {string} text
   * @returns {string}
   */
  _toNasalConsonants (text) {
    for (const [pat, repl] of this.patsRepls) {
      text = text.replace(pat, repl)
    }
    return text
  }

  /**
   * Prepare the nasal patterns for `this.nasalsMode`; any other mode
   * (including 'do_nothing') needs no preparation.
   */
  _initNormalizeNasals () {
    if (this.nasalsMode === 'to_anusvaara_strict') {
      this._initToAnusvaaraStrict()
    } else if (this.nasalsMode === 'to_anusvaara_relaxed') {
      this._initToAnusvaaraRelaxed()
    } else if (this.nasalsMode === 'to_nasal_consonants') {
      this._initToNasalConsonants()
    }
  }

  /**
   * Normalize nasals according to `this.nasalsMode`; unknown modes leave the
   * text unchanged.
   * @param {string} text
   * @returns {string}
   */
  _normalizeNasals (text) {
    if (this.nasalsMode === 'to_anusvaara_strict') {
      return this._toAnusvaaraStrict(text)
    } else if (this.nasalsMode === 'to_anusvaara_relaxed') {
      return this._toAnusvaaraRelaxed(text)
    } else if (this.nasalsMode === 'to_nasal_consonants') {
      return this._toNasalConsonants(text)
    } else {
      return text
    }
  }

  /**
   * For Dravidian
   * - consonant ending: add 'a' ki maatra
   * - halant ending: no change
   * - 'a' ki maatra: no change
   *
   * @param {string} word
   * @returns {string}
   */
  _normalizeWordVowelEndingDravidian (word) {
    if (
      word.length > 0 &&
      langinfo.isConsonant(word.charAt(word.length - 1), this.lang)
    ) {
      return word + langinfo.offsetToChar(0x3e, this.lang)
    }
    return word
  }

  /**
   * For IE
   * - consonant ending: add halant
   * - halant ending: no change
   * - 'a' ki maatra: no change
   *
   * @param {string} word
   * @returns {string}
   */
  _normalizeWordVowelEndingIe (word) {
    if (
      word.length > 0 &&
      langinfo.isConsonant(word.charAt(word.length - 1), this.lang)
    ) {
      return word + langinfo.offsetToChar(langinfo.HALANTA_OFFSET, this.lang)
    }
    return word
  }

  /**
   * Apply the selected vowel-ending function to every space-separated word.
   * @param {string} text
   * @returns {string}
   */
  _normalizeVowelEnding (text) {
    return text
      .split(' ')
      .map((w) => this.fnVowelEnding(w))
      .join(' ')
  }

  /**
   * Common normalization shared by every script; script specific subclasses
   * call this through super.normalize().
   *
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  normalize (text) {
    // replaceAll is required here: replace() with a string pattern only
    // touches the first occurrence, leaving later control characters in place.
    text = text.replaceAll(NormalizerI.BYTE_ORDER_MARK, '')
    text = text.replaceAll(NormalizerI.BYTE_ORDER_MARK_2, '')
    text = text.replaceAll(NormalizerI.WORD_JOINER, '')
    text = text.replaceAll(NormalizerI.SOFT_HYPHEN, '')

    text = text.replaceAll(NormalizerI.ZERO_WIDTH_SPACE, ' ') // ??
    text = text.replaceAll(NormalizerI.NO_BREAK_SPACE, ' ')

    text = text.replaceAll(NormalizerI.ZERO_WIDTH_NON_JOINER, '')
    text = text.replaceAll(NormalizerI.ZERO_WIDTH_JOINER, '')

    text = this._normalizePunctuations(text)

    if (this.doNormalizeChandras) {
      text = this._normalizeChandras(text)
    }
    text = this._normalizeNasals(text)
    if (this.doNormalizeVowelEnding) {
      text = this._normalizeVowelEnding(text)
    }

    return text
  }

  /**
   * Debug helper: logs, one number per line, how often each invisible
   * character handled by normalize() occurs in the text.
   *
   * @param {string} text
   */
  getCharStats (text) {
    const trackedChars = [
      NormalizerI.BYTE_ORDER_MARK,
      NormalizerI.BYTE_ORDER_MARK_2,
      NormalizerI.WORD_JOINER,
      NormalizerI.SOFT_HYPHEN,
      NormalizerI.ZERO_WIDTH_SPACE,
      NormalizerI.NO_BREAK_SPACE,
      NormalizerI.ZERO_WIDTH_NON_JOINER,
      NormalizerI.ZERO_WIDTH_JOINER
    ]

    for (const ch of trackedChars) {
      console.log(text.match(new RegExp(ch, 'g'))?.length || 0)
    }
  }

  /**
   * Replace a colon that directly follows a Devanagari character
   * (U+0900-U+097F) by the Devanagari visarga (U+0903).
   *
   * @param {string} text
   * @param {string} visargaChar - Accepted for interface compatibility; not used
   * @param {string} charRange - Accepted for interface compatibility; not used
   * @returns {string}
   */
  correctVisarga (text, visargaChar, charRange) {
    // The character class must not be negated: a colon after a non-script
    // character (e.g. "http:") is ordinary punctuation and has to be kept.
    return text.replace(/([\u0900-\u097f]):/g, '$1\u0903')
  }
}
370
+
371
/**
 * Normalizer for the Devanagari script. In addition to basic normalization by the super class,
 * * Replaces the composite characters containing nuktas by their decomposed form
 * * replace pipe character '|' by poorna virama character
 * * replace colon ':' by visarga if the colon follows a character in this script
 */
class DevanagariNormalizer extends BaseNormalizer {
  static NUKTA = '\u093C'

  /**
   * Constructor for DevanagariNormalizer
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   */
  constructor (
    lang = 'hi',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
  }

  /**
   * Normalize Devanagari text.
   *
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  normalize (text) {
    // [precomposed nukta character, base consonant]
    const nuktaComposites = [
      ['\u0929', '\u0928'],
      ['\u0931', '\u0930'],
      ['\u0934', '\u0933'],
      ['\u0958', '\u0915'],
      ['\u0959', '\u0916'],
      ['\u095A', '\u0917'],
      ['\u095B', '\u091C'],
      ['\u095C', '\u0921'],
      ['\u095D', '\u0922'],
      ['\u095E', '\u092B'],
      ['\u095F', '\u092F']
    ]

    // Common normalization for Indic scripts
    text = super.normalize(text)

    // Every replacement below must hit all occurrences; replace() with a
    // string pattern would stop after the first one, hence replaceAll().

    // chandra a replacement for Marathi
    text = text.replaceAll('\u0972', '\u090f')

    // decomposing Nukta based composite characters
    for (const [composite, base] of nuktaComposites) {
      text = text.replaceAll(composite, base + DevanagariNormalizer.NUKTA)
    }

    if (this.removeNuktas) {
      text = text.replaceAll(DevanagariNormalizer.NUKTA, '')
    }

    // replace pipe character for poorna virama
    text = text.replaceAll('\u007c', '\u0964')

    // correct visarga: colon directly after a Devanagari character (global,
    // so that every such colon in the text is corrected)
    text = text.replace(/([\u0900-\u097f]):/g, '$1\u0903')

    return text
  }

  /**
   * Debug helper: after the common statistics of the super class, logs the
   * number of occurrences of each precomposed nukta character.
   *
   * @param {string} text
   */
  getCharStats (text) {
    super.getCharStats(text)

    const nuktaComposites = [
      /\u0929/g,
      /\u0931/g,
      /\u0934/g,
      /\u0958/g,
      /\u0959/g,
      /\u095A/g,
      /\u095B/g,
      /\u095C/g,
      /\u095D/g,
      /\u095E/g,
      /\u095F/g
    ]

    for (const pattern of nuktaComposites) {
      console.log(text.match(pattern)?.length || 0)
    }
  }
}
454
+
455
/**
 * Normalizer for the Gurmukhi script. In addition to basic normalization by the super class,
 * * Replaces the composite characters containing nuktas by their decomposed form
 * * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
 * * replace pipe character '|' by poorna virama character
 * * replace colon ':' by visarga if the colon follows a character in this script
 */
class GurmukhiNormalizer extends BaseNormalizer {
  static NUKTA = '\u0A3C'

  static VOWEL_NORM_MAPS = {
    // http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
    // Table 12-16
    ਅਾ: '\u0a06',
    ੲਿ: '\u0a07',
    ੲੀ: '\u0a08',
    ੳੁ: '\u0a09',
    ੳੂ: '\u0a0a',
    ੲੇ: '\u0a0f',
    ਅੈ: '\u0a10',
    ੳੋ: '\u0a13',
    ਅੌ: '\u0a14'
  }

  /**
   * Constructor for GurmukhiNormalizer
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   * @param {boolean} doCanonicalizeAddak - Whether to canonicalize addak
   * @param {boolean} doCanonalizeTippi - Whether to canonicalize tippi
   * @param {boolean} doReplaceVowelBases - Whether to replace vowel bases
   */
  constructor (
    lang = 'pa',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false,
    doCanonicalizeAddak = false,
    doCanonalizeTippi = false,
    doReplaceVowelBases = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
    this.doCanonicalizeAddak = doCanonicalizeAddak
    this.doCanonalizeTippi = doCanonalizeTippi
    this.doReplaceVowelBases = doReplaceVowelBases
  }

  /**
   * Replace vowel-base + vowel-sign sequences by the single independent
   * vowel, and optionally the bare vowel bases by their closest vowels.
   *
   * @param {string} text
   * @returns {string}
   */
  _normalizeVowels (text) {
    // standard vowel replacements as per suggestions in
    // http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
    // Table 12-16
    for (const [sequence, vowel] of Object.entries(GurmukhiNormalizer.VOWEL_NORM_MAPS)) {
      text = text.replaceAll(sequence, vowel)
    }

    // If these special characters occur without any diacritic, replace them with closest
    // equivalent vowels
    if (this.doReplaceVowelBases) {
      text = text.replace(/\u0a72/g, '\u0a07')
      text = text.replace(/\u0a73/g, '\u0a09')
    }

    return text
  }

  /**
   * Normalize Gurmukhi text.
   *
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  normalize (text) {
    // [precomposed nukta character, base consonant]
    const nuktaComposites = [
      ['\u0a33', '\u0a32'],
      ['\u0a36', '\u0a38'],
      ['\u0a59', '\u0a16'],
      ['\u0a5a', '\u0a17'],
      ['\u0a5b', '\u0a1c'],
      ['\u0a5e', '\u0a2b']
    ]

    // Addak
    if (this.doCanonicalizeAddak) {
      // replace addak+consonant with consonant+halant+consonant
      text = text.replace(/\u0a71(.)/g, '$1\u0a4d$1')
    }

    // Tippi
    if (this.doCanonalizeTippi) {
      text = text.replace(/\u0a70/g, '\u0a02')
    }

    // Vowels: Gurmukhi has multiple ways of representing independent vowels due
    // to the characters 'iri' and 'ura'.
    text = this._normalizeVowels(text)

    // common normalization for Indic scripts
    text = super.normalize(text)

    // Every replacement below must hit all occurrences; replace() with a
    // string pattern would stop after the first one, hence replaceAll().

    // decomposing Nukta based composite characters
    for (const [composite, base] of nuktaComposites) {
      text = text.replaceAll(composite, base + GurmukhiNormalizer.NUKTA)
    }

    if (this.removeNuktas) {
      text = text.replaceAll(GurmukhiNormalizer.NUKTA, '')
    }

    // replace the poorna virama codes specific to script
    // with generic Indic script codes
    text = text.replaceAll('\u0a64', '\u0964')
    text = text.replaceAll('\u0a65', '\u0965')

    // replace pipe character for poorna virama
    text = text.replaceAll('\u007c', '\u0964')

    // correct visarga: colon directly after a Gurmukhi character (global,
    // so that every such colon in the text is corrected)
    text = text.replace(/([\u0a00-\u0a7f]):/g, '$1\u0a03')

    return text
  }
}
576
+
577
/**
 * Normalizer for the Gujarati script. In addition to basic normalization by the super class,
 * * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
 * * replace colon ':' by visarga if the colon follows a character in this script
 */
class GujaratiNormalizer extends BaseNormalizer {
  static NUKTA = '\u0ABC'

  /**
   * Constructor for GujaratiNormalizer
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   */
  constructor (
    lang = 'gu',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
  }

  /**
   * Normalize Gujarati text.
   *
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  normalize (text) {
    // common normalization for Indic scripts
    text = super.normalize(text)

    // optionally strip every nukta
    if (this.removeNuktas) {
      text = text.replaceAll(GujaratiNormalizer.NUKTA, '')
    }

    // replace the poorna virama codes specific to script
    // with generic Indic script codes.
    // replaceAll: replace() with a string pattern would only change the
    // first occurrence in the text.
    text = text.replaceAll('\u0ae4', '\u0964')
    text = text.replaceAll('\u0ae5', '\u0965')

    // correct visarga: colon directly after a Gujarati character (global,
    // so that every such colon in the text is corrected)
    text = text.replace(/([\u0a80-\u0aff]):/g, '$1\u0a83')

    return text
  }
}
630
+
631
class OriyaNormalizer extends BaseNormalizer {
  /**
   * Normalizer for the Oriya script. In addition to basic normalization by the super class,
   * * Replaces the composite characters containing nuktas by their decomposed form
   * * Replaces the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
   * * Canonicalizes two part dependent vowels
   * * Replaces 'va' with 'ba' (and 'wa' with 'ba' when doRemapWa is set)
   * * Replaces colon ':' by visarga if the colon follows a character in this script
   *
   * All rewrites are applied to every occurrence in the text.
   */

  static NUKTA = '\u0B3C'

  // Independent vowel + vowel sign sequences mapped to the single code point
  // for the same vowel.
  static VOWEL_NORM_MAPS = {
    // See Table 12-22 in http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
    '\u0b05\u0b3e': '\u0b06',
    '\u0b0f\u0b57': '\u0b10',
    '\u0b13\u0b57': '\u0b14'
  }

  /**
   * Constructor for OriyaNormalizer
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   * @param {boolean} doRemapWa - Whether to remap wa (U+0B71) to ba (U+0B2C)
   */
  constructor (
    lang = 'or',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false,
    doRemapWa = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
    this.doRemapWa = doRemapWa
  }

  /**
   * Normalize Oriya text.
   * @param {string} text - Text to normalize
   * @returns {string} - Normalized text
   */
  normalize (text) {
    // common normalization for Indic scripts
    text = super.normalize(text)

    // standard vowel replacements as per suggestions in Unicode documents
    for (const [sequence, vowel] of Object.entries(OriyaNormalizer.VOWEL_NORM_MAPS)) {
      text = text.replaceAll(sequence, vowel)
    }

    // decomposing Nukta based composite characters
    text = text.replaceAll('\u0b5c', '\u0b21' + OriyaNormalizer.NUKTA)
    text = text.replaceAll('\u0b5d', '\u0b22' + OriyaNormalizer.NUKTA)

    if (this.removeNuktas) {
      text = text.replaceAll(OriyaNormalizer.NUKTA, '')
    }

    // replace the poorna virama codes specific to script
    // with generic Indic script codes
    text = text.replaceAll('\u0b64', '\u0964')
    text = text.replaceAll('\u0b65', '\u0965')

    // replace pipe character for poorna virama
    // NOTE(review): the code point replaced here is U+0B7C, not the ASCII
    // pipe U+007C; kept unchanged - confirm which one is intended.
    text = text.replaceAll('\u0b7c', '\u0964')

    // replace wa with ba
    if (this.doRemapWa) {
      text = text.replaceAll('\u0b71', '\u0b2c')
    }

    // replace va with ba
    // NOTE: documentation (chapter on Indic scripts) and codepoint chart seem contradictory
    // (this applied to wa to ba rule also above)
    text = text.replaceAll('\u0b35', '\u0b2c')

    // AI dependent vowel sign
    // NOTE(review): the replacement target is U+0B58; the Oriya AI vowel sign
    // is presumably U+0B48 - kept unchanged, confirm before altering.
    text = text.replaceAll('\u0b47\u0b56', '\u0b58')

    // two part dependent vowels
    text = text.replaceAll('\u0b47\u0b3e', '\u0b4b')
    text = text.replaceAll('\u0b47\u0b57', '\u0b4c')

    // additional consonant - not clear how to handle this
    // ignore

    // correct visarga: a colon directly after any code point of the Oriya
    // block (U+0B00-U+0B7F) becomes the visarga sign U+0B03
    text = text.replace(/([\u0b00-\u0b7f]):/g, '$1\u0b03')

    return text
  }
}
729
+
730
class BengaliNormalizer extends BaseNormalizer {
  /**
   * Normalizer for the Bengali script. In addition to basic normalization by the super class,
   * * Replaces the composite characters containing nuktas by their decomposed form
   * * Replaces the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
   * * Canonicalizes two part dependent vowels
   * * Replaces pipe character '|' by poorna virama character
   * * Replaces colon ':' by visarga if the colon follows a character in this script
   *
   * All rewrites are applied to every occurrence in the text.
   */

  static NUKTA = '\u09BC'

  /**
   * Constructor for BengaliNormalizer
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   * @param {boolean} doRemapAssameseChars - Whether to remap Assamese characters
   *   (only takes effect when lang is 'as')
   */
  constructor (
    lang = 'bn',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false,
    doRemapAssameseChars = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
    this.doRemapAssameseChars = doRemapAssameseChars
  }

  /**
   * Normalize Bengali (or Assamese) text.
   * @param {string} text - Text to normalize
   * @returns {string} - Normalized text
   */
  normalize (text) {
    // common normalization for Indic scripts
    text = super.normalize(text)

    // decomposing Nukta based composite characters
    text = text.replaceAll('\u09dc', '\u09a1' + BengaliNormalizer.NUKTA)
    text = text.replaceAll('\u09dd', '\u09a2' + BengaliNormalizer.NUKTA)
    text = text.replaceAll('\u09df', '\u09af' + BengaliNormalizer.NUKTA)

    if (this.removeNuktas) {
      text = text.replaceAll(BengaliNormalizer.NUKTA, '')
    }

    if (this.doRemapAssameseChars && this.lang === 'as') {
      text = text.replaceAll('\u09f0', '\u09b0') // 'ra' character
      text = text.replaceAll('\u09f1', '\u09ac') // 'va' character
    }

    // replace the poorna virama codes specific to script
    // with generic Indic script codes
    text = text.replaceAll('\u09e4', '\u0964')
    text = text.replaceAll('\u09e5', '\u0965')

    // replace pipe character for poorna virama
    text = text.replaceAll('\u007c', '\u0964')
    // replace bengali currency numerator four for poorna virama (it looks similar and is used as a substitute)
    text = text.replaceAll('\u09f7', '\u0964')

    // two part dependent vowels
    text = text.replaceAll('\u09c7\u09be', '\u09cb')
    text = text.replaceAll('\u09c7\u09d7', '\u09cc')

    // correct visarga: a colon directly after any code point of the Bengali
    // block (U+0980-U+09FF) becomes the visarga sign U+0983
    text = text.replace(/([\u0980-\u09ff]):/g, '$1\u0983')

    return text
  }
}
807
+
808
class TamilNormalizer extends BaseNormalizer {
  /**
   * Normalizer for the Tamil script. In addition to basic normalization by the super class,
   * * Replaces the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
   * * Canonicalizes two-part dependent vowel signs
   * * Replaces colon ':' by visarga if the colon follows a character in this script
   *
   * All rewrites are applied to every occurrence in the text.
   */

  /**
   * Constructor for TamilNormalizer
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   */
  constructor (
    lang = 'ta',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
  }

  /**
   * Normalize Tamil text.
   * @param {string} text - Text to normalize
   * @returns {string} - Normalized text
   */
  normalize (text) {
    // common normalization for Indic scripts
    text = super.normalize(text)

    // replace the poorna virama codes specific to script
    // with generic Indic script codes
    text = text.replaceAll('\u0be4', '\u0964')
    text = text.replaceAll('\u0be5', '\u0965')

    // two part dependent vowels
    text = text.replaceAll('\u0b92\u0bd7', '\u0b94')
    text = text.replaceAll('\u0bc6\u0bbe', '\u0bca')
    text = text.replaceAll('\u0bc7\u0bbe', '\u0bcb')
    text = text.replaceAll('\u0bc6\u0bd7', '\u0bcc')

    // correct visarga: a colon directly after any code point of the Tamil
    // block (U+0B80-U+0BFF) becomes the visarga sign U+0B83
    text = text.replace(/([\u0b80-\u0bff]):/g, '$1\u0b83')

    return text
  }
}
861
+
862
class TeluguNormalizer extends BaseNormalizer {
  /**
   * Normalizer for the Telugu script. In addition to basic normalization by the super class,
   * * Replaces the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
   * * Canonicalizes two-part dependent vowel signs
   * * Replaces colon ':' by visarga if the colon follows a character in this script
   *
   * All rewrites are applied to every occurrence in the text.
   */

  /**
   * Constructor for TeluguNormalizer
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   */
  constructor (
    lang = 'te',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
  }

  /**
   * Normalize Telugu text.
   * @param {string} text - Text to normalize
   * @returns {string} - Normalized text
   */
  normalize (text) {
    // common normalization for Indic scripts
    text = super.normalize(text)

    // replace the poorna virama codes specific to script
    // with generic Indic script codes
    text = text.replaceAll('\u0c64', '\u0964')
    text = text.replaceAll('\u0c65', '\u0965')

    // dependent vowels
    text = text.replaceAll('\u0c46\u0c56', '\u0c48')

    // correct visarga: a colon directly after any code point of the Telugu
    // block (U+0C00-U+0C7F) becomes the visarga sign U+0C03. The class must
    // span the whole block so that letters and vowel signs qualify, not just
    // the digits and signs at the end of the block.
    text = text.replace(/([\u0c00-\u0c7f]):/g, '$1\u0c03')

    return text
  }

  /**
   * Intentionally empty: performs no work and returns undefined.
   * @param {string} text - Unused
   */
  getCharStats (text) {
    // Empty implementation
  }
}
916
+
917
class KannadaNormalizer extends BaseNormalizer {
  /**
   * Normalizer for the Kannada script. In addition to basic normalization by the super class,
   * * Replaces the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
   * * Canonicalizes two-part dependent vowel signs
   * * Replaces colon ':' by visarga if the colon follows a character in this script
   *
   * All rewrites are applied to every occurrence in the text.
   */

  /**
   * Constructor for KannadaNormalizer
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   */
  constructor (
    lang = 'kn',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
  }

  /**
   * Normalize Kannada text.
   * @param {string} text - Text to normalize
   * @returns {string} - Normalized text
   */
  normalize (text) {
    // common normalization for Indic scripts
    text = super.normalize(text)

    // replace the poorna virama codes specific to script
    // with generic Indic script codes
    text = text.replaceAll('\u0ce4', '\u0964')
    text = text.replaceAll('\u0ce5', '\u0965')

    // dependent vowels; order matters: U+0CCA produced by the fourth rule is
    // consumed by the fifth one when a length mark follows
    text = text.replaceAll('\u0cbf\u0cd5', '\u0cc0')
    text = text.replaceAll('\u0cc6\u0cd5', '\u0cc7')
    text = text.replaceAll('\u0cc6\u0cd6', '\u0cc8')
    text = text.replaceAll('\u0cc6\u0cc2', '\u0cca')
    text = text.replaceAll('\u0cca\u0cd5', '\u0ccb')

    // correct visarga: a colon directly after any code point of the Kannada
    // block (U+0C80-U+0CFF) becomes the visarga sign U+0C83
    text = text.replace(/([\u0c80-\u0cff]):/g, '$1\u0c83')

    return text
  }
}
971
+
972
class MalayalamNormalizer extends BaseNormalizer {
  /**
   * Normalizer for the Malayalam script. In addition to basic normalization by the super class,
   * * Replaces the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
   * * Canonicalizes two-part dependent vowel signs
   * * Changes from old encoding of chillus (till Unicode 5.0) to new encoding
   * * Replaces colon ':' by visarga if the colon follows a character in this script
   *
   * All rewrites are applied to every occurrence in the text.
   */

  // Atomic chillu code point -> base consonant it is derived from.
  static CHILLU_CHAR_MAP = {
    '\u0d7a': '\u0d23',
    '\u0d7b': '\u0d28',
    '\u0d7c': '\u0d30',
    '\u0d7d': '\u0d32',
    '\u0d7e': '\u0d33',
    '\u0d7f': '\u0d15'
  }

  /**
   * Rewrite every atomic chillu as its base consonant followed by the
   * virama U+0D4D.
   * @param {string} text - Text to process
   * @returns {string} - Text with chillus canonicalized
   */
  _canonicalizeChillus (text) {
    for (const [chillu, consonant] of Object.entries(
      MalayalamNormalizer.CHILLU_CHAR_MAP
    )) {
      text = text.replaceAll(chillu, `${consonant}\u0d4d`)
    }
    return text
  }

  /**
   * Rewrite every U+0D31 U+0D4D U+0D31 sequence as U+0D1F U+0D4D U+0D1F.
   * @param {string} text - Text to process
   * @returns {string} - Text with the geminated T corrected
   */
  _correctGeminatedT (text) {
    return text.replaceAll('\u0d31\u0d4d\u0d31', '\u0d1f\u0d4d\u0d1f')
  }

  /**
   * Constructor for MalayalamNormalizer
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   * @param {boolean} doCanonicalizeChillus - Whether to canonicalize chillus
   * @param {boolean} doCorrectGeminatedT - Whether to correct geminated T
   */
  constructor (
    lang = 'ml',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false,
    doCanonicalizeChillus = false,
    doCorrectGeminatedT = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
    this.doCanonicalizeChillus = doCanonicalizeChillus
    this.doCorrectGeminatedT = doCorrectGeminatedT
  }

  /**
   * Normalize Malayalam text.
   *
   * Unlike the other script normalizers in this file, the chillu handling
   * runs BEFORE the parent class normalization.
   *
   * @param {string} text - Text to normalize
   * @returns {string} - Normalized text
   */
  normalize (text) {
    // Change from old encoding of chillus (till Unicode 5.0) to new encoding:
    // consonant + virama + zero-width joiner -> atomic chillu
    text = text.replaceAll('\u0d23\u0d4d\u200d', '\u0d7a')
    text = text.replaceAll('\u0d28\u0d4d\u200d', '\u0d7b')
    text = text.replaceAll('\u0d30\u0d4d\u200d', '\u0d7c')
    text = text.replaceAll('\u0d32\u0d4d\u200d', '\u0d7d')
    text = text.replaceAll('\u0d33\u0d4d\u200d', '\u0d7e')
    text = text.replaceAll('\u0d15\u0d4d\u200d', '\u0d7f')

    // Normalize chillus
    if (this.doCanonicalizeChillus) {
      text = this._canonicalizeChillus(text)
    }

    // common normalization for Indic scripts
    text = super.normalize(text)

    // replace the poorna virama codes specific to script
    // with generic Indic script codes
    text = text.replaceAll('\u0d64', '\u0964')
    text = text.replaceAll('\u0d65', '\u0965')

    // dependent vowels
    text = text.replaceAll('\u0d46\u0d3e', '\u0d4a')
    text = text.replaceAll('\u0d47\u0d3e', '\u0d4b')

    // au forms: the two-part sequence first, then any remaining lone U+0D57
    text = text.replaceAll('\u0d46\u0d57', '\u0d4c')
    text = text.replaceAll('\u0d57', '\u0d4c')

    // correct geminated T
    if (this.doCorrectGeminatedT) {
      text = this._correctGeminatedT(text)
    }

    // correct visarga: a colon directly after any code point of the Malayalam
    // block (U+0D00-U+0D7F) becomes the visarga sign U+0D03
    text = text.replace(/([\u0d00-\u0d7f]):/g, '$1\u0d03')

    return text
  }
}
1074
+
1075
class UrduNormalizer extends NormalizerI {
  /**
   * Urdu normalizer modelled on the UrduHack library.
   * https://docs.urduhack.com/en/stable/_modules/urduhack/normalization/character.html#normalize
   *
   * The character-level steps (removeDiacritics, normalizeCharacters,
   * normalizeCombineCharacters) are identity placeholders; only the
   * whitespace and spacing helpers do real work. A warning is logged on every
   * construction to make that visible.
   *
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   *   (here: whether the removeDiacritics step runs)
   */
  constructor (lang, removeNuktas = true) {
    super()
    this.lang = lang
    this.removeNuktas = removeNuktas

    // The helpers below are plain assignments that cannot throw, so they are
    // not wrapped in try/catch: swallowing an error here would only leave a
    // half-initialized instance that fails later inside normalize().

    // Collapse every run of whitespace into a single space.
    this.normalizeWhitespace = (text) => text.replace(/\s+/g, ' ')

    // Put a space between an ASCII digit and an adjacent non-digit.
    this.digitsSpace = (text) =>
      text
        .replace(/(\d)([^\d\s])/g, '$1 $2')
        .replace(/([^\d\s])(\d)/g, '$1 $2')

    // Put a space on both sides of punctuation. "Word" characters are
    // matched with Unicode properties (letters, marks, numbers, underscore):
    // the ASCII-only \w would classify every Urdu letter as punctuation and
    // split words into single characters.
    this.allPunctuationsSpace = (text) =>
      text
        .replace(/([^\p{L}\p{M}\p{N}_\s])(\S)/gu, '$1 $2')
        .replace(/(\S)([^\p{L}\p{M}\p{N}_\s])/gu, '$1 $2')

    // Put a space between a Latin letter and an adjacent non-Latin character.
    this.englishCharactersSpace = (text) =>
      text
        .replace(/([a-zA-Z])([^a-zA-Z\s])/g, '$1 $2')
        .replace(/([^a-zA-Z\s])([a-zA-Z])/g, '$1 $2')

    this.removeDiacritics = (text) => text // Placeholder
    this.normalizeCharacters = (text) => text // Placeholder
    this.normalizeCombineCharacters = (text) => text // Placeholder

    console.warn(
      'Warning: UrduNormalizer is using placeholder implementations. For full functionality, equivalent JavaScript implementations of urduhack functions are needed.'
    )
  }

  /**
   * Normalize Urdu text by running the helper steps in a fixed order.
   * `_normalizePunctuations` is not defined in this class; it is presumably
   * inherited from NormalizerI - verify against the base class.
   *
   * @param {string} text - Text to normalize
   * @returns {string} - Normalized text
   */
  normalize (text) {
    text = this._normalizePunctuations(text)
    text = this.normalizeWhitespace(text)
    if (this.removeNuktas) {
      text = this.removeDiacritics(text)
    }
    text = this.normalizeCharacters(text)
    text = this.normalizeCombineCharacters(text)
    text = this.digitsSpace(text)
    text = this.allPunctuationsSpace(text)
    text = this.englishCharactersSpace(text)
    return text
  }
}
1129
+
1130
class IndicNormalizerFactory {
  /**
   * Factory class to create language specific normalizers.
   */

  /**
   * Get the language specific normalizer.
   *
   * The normalizer constructors take positional arguments, so the options
   * object is unpacked here. Passing the object itself as the second
   * argument would land in the `removeNuktas` slot, where any object is
   * truthy: nukta removal would always be on and every other option ignored.
   *
   * Recognized option keys: removeNuktas, nasalsMode, doNormalizeChandras,
   * doNormalizeVowelEnding, plus doRemapAssameseChars ('bn'/'as'),
   * doRemapWa ('or'), doCanonicalizeChillus and doCorrectGeminatedT ('ml').
   * Omitted keys fall back to the constructor defaults. A bare boolean is
   * accepted in place of the object and treated as `removeNuktas`.
   *
   * @param {string} language - Language code
   * @param {Object} options - Options for normalizer
   * @returns {NormalizerI} - Language specific normalizer
   */
  static getNormalizer (language, options = {}) {
    const opts =
      typeof options === 'boolean' ? { removeNuktas: options } : options ?? {}

    // Leading positional arguments shared by the BaseNormalizer-derived
    // classes (the order used by their super(...) calls).
    const common = [
      language,
      opts.removeNuktas,
      opts.nasalsMode,
      opts.doNormalizeChandras,
      opts.doNormalizeVowelEnding
    ]

    switch (language) {
      case 'hi':
      case 'mr':
      case 'sa':
      case 'kK':
      case 'ne':
      case 'sd':
        // assumes DevanagariNormalizer takes the common argument order - TODO confirm
        return new DevanagariNormalizer(...common)
      case 'ur':
        return new UrduNormalizer(language, opts.removeNuktas)
      case 'pa':
        // assumes GurmukhiNormalizer takes the common argument order first;
        // any Gurmukhi-specific options are not forwarded - TODO confirm
        return new GurmukhiNormalizer(...common)
      case 'gu':
        return new GujaratiNormalizer(...common)
      case 'bn':
      case 'as':
        return new BengaliNormalizer(...common, opts.doRemapAssameseChars)
      case 'or':
        return new OriyaNormalizer(...common, opts.doRemapWa)
      case 'ml':
        return new MalayalamNormalizer(
          ...common,
          opts.doCanonicalizeChillus,
          opts.doCorrectGeminatedT
        )
      case 'kn':
        return new KannadaNormalizer(...common)
      case 'ta':
        return new TamilNormalizer(...common)
      case 'te':
        return new TeluguNormalizer(...common)
      default:
        // Unknown languages still get the script-independent normalization.
        return new BaseNormalizer(...common)
    }
  }

  /**
   * Check if a language is supported, i.e. whether getNormalizer has a
   * dedicated normalizer for it rather than the BaseNormalizer fallback.
   * @param {string} language - Language code
   * @returns {boolean} - Whether the language is supported
   */
  static isLanguageSupported (language) {
    return [
      'hi',
      'mr',
      'sa',
      'kK',
      'ne',
      'sd',
      'ur',
      'pa',
      'gu',
      'bn',
      'as',
      'or',
      'ml',
      'kn',
      'ta',
      'te'
    ].includes(language)
  }
}
1198
+
1199
+ module.exports = {
1200
+ NormalizerI,
1201
+ BaseNormalizer,
1202
+ DevanagariNormalizer,
1203
+ GurmukhiNormalizer,
1204
+ GujaratiNormalizer,
1205
+ OriyaNormalizer,
1206
+ BengaliNormalizer,
1207
+ TamilNormalizer,
1208
+ TeluguNormalizer,
1209
+ KannadaNormalizer,
1210
+ MalayalamNormalizer,
1211
+ UrduNormalizer,
1212
+ IndicNormalizerFactory
1213
+ }