@qvac/translation-nmtcpp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -0
- package/README.md +470 -0
- package/binding.js +1 -0
- package/index.d.ts +82 -0
- package/index.js +188 -0
- package/lib/error.js +65 -0
- package/marian.js +186 -0
- package/package.json +69 -0
- package/prebuilds/android-arm/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/android-arm64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/android-ia32/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/android-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare.exports +3622 -0
- package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare.exports +3731 -0
- package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare.exports +3603 -0
- package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare.exports +3603 -0
- package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare.exports +3720 -0
- package/prebuilds/linux-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare.exports +0 -0
- package/third-party/indic-processor-deps/indicnlp/INDIC_NLP_LICENCE +9 -0
- package/third-party/indic-processor-deps/indicnlp/index.js +11 -0
- package/third-party/indic-processor-deps/indicnlp/indic_detokenize.js +141 -0
- package/third-party/indic-processor-deps/indicnlp/indic_normalize.js +1213 -0
- package/third-party/indic-processor-deps/indicnlp/indic_tokenize.js +123 -0
- package/third-party/indic-processor-deps/indicnlp/langinfo.js +609 -0
- package/third-party/indic-processor-deps/indicnlp/sinhala_transliterator.js +197 -0
- package/third-party/indic-processor-deps/indicnlp/unicode_transliterator.js +120 -0
- package/third-party/indic-processor-deps/sacremoses/SACREMOSES_LICENCE +21 -0
- package/third-party/indic-processor-deps/sacremoses/cjk.js +202 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/README.txt +8 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.as +65 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.bn +65 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ca +75 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.cs +390 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.de +325 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.el +1568 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.en +123 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.es +118 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.et +138 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fi +138 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fr +153 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ga +48 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.gu +105 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hi +113 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hu +103 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.is +251 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.it +180 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.kn +70 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lt +698 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lv +100 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ml +67 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mni +65 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mr +113 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.nl +115 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.or +101 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pa +102 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pl +283 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pt +210 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ro +38 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ru +293 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sk +474 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sl +78 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sv +97 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ta +71 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.tdt +210 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.te +70 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.yue +53 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.zh +53 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJK.txt +23246 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJKSymbols.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Close_Punctuation.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Currency_Symbol.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Han.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul_Syllables.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hiragana.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum-unichars-au.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha-unichars-au.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsLower.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsN.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPf.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPi.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSc.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSo.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsUpper.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Katakana.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Line_Separator.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Lowercase_Letter.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Number.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Open_Punctuation.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Punctuation.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Separator.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Symbol.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Titlecase_Letter.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Uppercase_Letter.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/index.js +8 -0
- package/third-party/indic-processor-deps/sacremoses/indic.js +76 -0
- package/third-party/indic-processor-deps/sacremoses/normalizer.js +264 -0
- package/third-party/indic-processor-deps/sacremoses/pernuliprops.js +287 -0
- package/third-party/indic-processor-deps/sacremoses/tokenizer.js +1217 -0
- package/third-party/indic-processor.js +565 -0
|
@@ -0,0 +1,1213 @@
|
|
|
1
|
+
/**
|
|
2
|
+
*
|
|
3
|
+
* Copyright (c) 2013-present, Anoop Kunchukuttan
|
|
4
|
+
* All rights reserved.
|
|
5
|
+
*
|
|
6
|
+
* This source code is licensed under the MIT license found in the
|
|
7
|
+
* INDIC_NLP_LICENCE file in the indicnlp directory of this source tree.
|
|
8
|
+
*
|
|
9
|
+
* This code is a ported version of the sacremoses library. Please refer to NOTICE
|
|
10
|
+
* file in the root directory of this source tree.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
const langinfo = require('./langinfo')
|
|
14
|
+
|
|
15
|
+
class NormalizerI {
  /**
   * Common root of the script normalizers.
   *
   * The normalizer classes do the following:
   * * Some characters have multiple Unicode codepoints. The normalizer chooses a single standard representation
   * * Some control characters are deleted
   * * While typing using the Latin keyboard, certain typical mistakes occur which are corrected by the module
   *
   * This class only provides the shared character constants and the
   * punctuation normalization helper. Script specific normalizers should
   * derive from this class and override the normalize() method.
   */

  static BYTE_ORDER_MARK = '\uFEFF'
  static BYTE_ORDER_MARK_2 = '\uFFFE'
  static WORD_JOINER = '\u2060'
  static SOFT_HYPHEN = '\u00AD'

  static ZERO_WIDTH_SPACE = '\u200B'
  static NO_BREAK_SPACE = '\u00A0'

  static ZERO_WIDTH_NON_JOINER = '\u200C'
  static ZERO_WIDTH_JOINER = '\u200D'

  /**
   * Normalize punctuations.
   * Applies many of the punctuation normalizations that are part of
   * MosesNormalizer from sacremoses.
   *
   * Every occurrence of each source sequence is replaced, in the order listed
   * below. The non-ASCII characters are written as Unicode escapes so the
   * table cannot be silently flattened to ASCII by an editor or transcoder.
   *
   * @param {string} text - Text to normalize
   * @returns {string} Text with typographic punctuation mapped to ASCII
   */
  _normalizePunctuations (text) {
    const substitutions = [
      [NormalizerI.BYTE_ORDER_MARK, ''],
      ['\u201E', '"'], // double low-9 quotation mark
      ['\u201C', '"'], // left double quotation mark
      ['\u201D', '"'], // right double quotation mark
      ['\u2013', '-'], // en dash
      ['\u2014', ' - '], // em dash
      ['\u00B4', "'"], // acute accent
      ['\u2018', "'"], // left single quotation mark
      ['\u201A', "'"], // single low-9 quotation mark
      ['\u2019', "'"], // right single quotation mark
      // Two consecutive apostrophes become one double quote. This also covers
      // a doubled acute accent, because each accent was already turned into
      // an apostrophe above.
      ["''", '"'],
      ['\u2026', '...'] // horizontal ellipsis
    ]

    for (const [from, to] of substitutions) {
      text = text.replaceAll(from, to)
    }

    return text
  }

  /**
   * Normalize the given text. This base implementation does nothing and
   * returns undefined.
   *
   * @param {string} text - Text to normalize
   */
  normalize (text) {
    // Method to be implemented by subclasses
  }
}
|
|
67
|
+
|
|
68
|
+
class BaseNormalizer extends NormalizerI {
  /**
   * Offsets shared by the 'to_anusvaara_strict' and 'to_nasal_consonants'
   * modes. Each entry is [nasal offset, first offset, last offset], where the
   * last two delimit the character range that must follow the nasal.
   * Offsets are turned into characters with langinfo.offsetToChar().
   */
  static #NASAL_PAT_SIGNATURES = [
    [0x19, 0x15, 0x18],
    [0x1e, 0x1a, 0x1d],
    [0x23, 0x1f, 0x22],
    [0x28, 0x24, 0x27],
    [0x29, 0x24, 0x27],
    [0x2e, 0x2a, 0x2d]
  ]

  /**
   * Base normalizer for Indic scripts. Performs the common normalization:
   * * Byte order mark, word joiner, soft hyphen removal
   * * ZERO_WIDTH_NON_JOINER and ZERO_WIDTH_JOINER removal
   * * ZERO_WIDTH_SPACE and NO_BREAK_SPACE replaced by spaces
   * * punctuation normalization
   * * optional chandra, nasal and vowel-ending normalization
   *
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters (stored for use by subclasses)
   * @param {string} nasalsMode - How to handle nasal characters ('do_nothing', 'to_anusvaara_strict', 'to_anusvaara_relaxed', 'to_nasal_consonants')
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   */
  constructor (
    lang = 'hi',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false
  ) {
    super()
    this.lang = lang
    this.removeNuktas = removeNuktas
    this.nasalsMode = nasalsMode
    this.doNormalizeChandras = doNormalizeChandras
    this.doNormalizeVowelEnding = doNormalizeVowelEnding

    this._initNormalizeChandras()
    this._initNormalizeNasals()
    this._initNormalizeVowelEnding()
  }

  /**
   * Select the per-word vowel-ending function for this.lang: the IE variant,
   * the Dravidian variant, or the identity function for any other language.
   */
  _initNormalizeVowelEnding () {
    if (langinfo.IE_LANGUAGES.includes(this.lang)) {
      this.fnVowelEnding = this._normalizeWordVowelEndingIe
    } else if (langinfo.DRAVIDIAN_LANGUAGES.includes(this.lang)) {
      this.fnVowelEnding = this._normalizeWordVowelEndingDravidian
    } else {
      this.fnVowelEnding = (x) => x
    }
  }

  /**
   * Build this.chandraSubstitutions: a list of [character, replacement] pairs
   * derived from script-relative offsets for this.lang.
   */
  _initNormalizeChandras () {
    const substitutionOffsets = [
      [0x0d, 0x0f], // chandra e, independent
      [0x11, 0x13], // chandra o, independent
      [0x45, 0x47], // chandra e, dependent
      [0x49, 0x4b], // chandra o, dependent
      // [0x72, 0x0f], // mr: chandra e, independent

      [0x00, 0x02], // chandrabindu
      [0x01, 0x02] // chandrabindu
    ]

    this.chandraSubstitutions = substitutionOffsets.map(([from, to]) => [
      langinfo.offsetToChar(from, this.lang),
      langinfo.offsetToChar(to, this.lang)
    ])
  }

  /**
   * Apply every chandra substitution to the text. The characters are matched
   * as literal text, so no regular expression is built per call.
   *
   * @param {string} text - Text to normalize
   * @returns {string} Text with all chandra characters substituted
   */
  _normalizeChandras (text) {
    for (const [match, repl] of this.chandraSubstitutions) {
      text = text.replaceAll(match, repl)
    }
    return text
  }

  /**
   * Prepare this.patsRepls for 'to_anusvaara_strict' as [patterns, replacement].
   * Each pattern matches nasal + halant followed by a character of the
   * signature's range; the replacement is anusvaara + that character.
   */
  _initToAnusvaaraStrict () {
    const toChar = (offset) => langinfo.offsetToChar(offset, this.lang)
    const halant = toChar(0x4d)
    const anusvaara = toChar(0x02)

    const pats = BaseNormalizer.#NASAL_PAT_SIGNATURES.map(
      ([nasal, rangeStart, rangeEnd]) =>
        new RegExp(
          `${toChar(nasal)}${halant}([${toChar(rangeStart)}-${toChar(rangeEnd)}])`,
          'g'
        )
    )

    this.patsRepls = [pats, `${anusvaara}$1`]
  }

  /**
   * Apply the patterns prepared by _initToAnusvaaraStrict().
   *
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  _toAnusvaaraStrict (text) {
    const [pats, replString] = this.patsRepls
    for (const pat of pats) {
      text = text.replace(pat, replString)
    }
    return text
  }

  /**
   * Prepare this.patsRepls for 'to_anusvaara_relaxed' as [pattern, replacement].
   * The pattern matches any listed nasal followed by halant, regardless of
   * what comes next; the replacement is the anusvaara.
   */
  _initToAnusvaaraRelaxed () {
    const toChar = (offset) => langinfo.offsetToChar(offset, this.lang)
    const nasalsListStr = [0x19, 0x1e, 0x23, 0x28, 0x29, 0x2e]
      .map(toChar)
      .join('')

    const pat = new RegExp(`[${nasalsListStr}]${toChar(0x4d)}`, 'g')

    this.patsRepls = [pat, toChar(0x02)]
  }

  /**
   * Apply the pattern prepared by _initToAnusvaaraRelaxed().
   *
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  _toAnusvaaraRelaxed (text) {
    const [pat, replString] = this.patsRepls
    return text.replace(pat, replString)
  }

  /**
   * Prepare this.patsRepls for 'to_nasal_consonants' as a list of
   * [pattern, replacement] pairs. Each pattern matches anusvaara followed by
   * a character of the signature's range; the replacement is
   * nasal + halant + that character.
   */
  _initToNasalConsonants () {
    const toChar = (offset) => langinfo.offsetToChar(offset, this.lang)
    const halant = toChar(0x4d)
    const anusvaara = toChar(0x02)

    this.patsRepls = BaseNormalizer.#NASAL_PAT_SIGNATURES.map(
      ([nasal, rangeStart, rangeEnd]) => [
        new RegExp(
          `${anusvaara}([${toChar(rangeStart)}-${toChar(rangeEnd)}])`,
          'g'
        ),
        `${toChar(nasal)}${halant}$1`
      ]
    )
  }

  /**
   * Apply the pairs prepared by _initToNasalConsonants(), in order.
   *
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  _toNasalConsonants (text) {
    for (const [pat, repl] of this.patsRepls) {
      text = text.replace(pat, repl)
    }
    return text
  }

  /**
   * Prepare the patterns for the configured nasalsMode. Any other mode
   * (including 'do_nothing') needs no preparation.
   */
  _initNormalizeNasals () {
    if (this.nasalsMode === 'to_anusvaara_strict') {
      this._initToAnusvaaraStrict()
    } else if (this.nasalsMode === 'to_anusvaara_relaxed') {
      this._initToAnusvaaraRelaxed()
    } else if (this.nasalsMode === 'to_nasal_consonants') {
      this._initToNasalConsonants()
    }
  }

  /**
   * Normalize nasals according to the configured nasalsMode; the text is
   * returned unchanged for any other mode.
   *
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  _normalizeNasals (text) {
    if (this.nasalsMode === 'to_anusvaara_strict') {
      return this._toAnusvaaraStrict(text)
    } else if (this.nasalsMode === 'to_anusvaara_relaxed') {
      return this._toAnusvaaraRelaxed(text)
    } else if (this.nasalsMode === 'to_nasal_consonants') {
      return this._toNasalConsonants(text)
    } else {
      return text
    }
  }

  /**
   * For Dravidian
   * - consonant ending: add 'a' ki maatra
   * - halant ending: no change
   * - 'a' ki maatra: no change
   *
   * @param {string} word - A single word
   * @returns {string} The word, extended when it ends in a consonant
   */
  _normalizeWordVowelEndingDravidian (word) {
    const endsInConsonant =
      word.length > 0 &&
      langinfo.isConsonant(word.charAt(word.length - 1), this.lang)

    if (!endsInConsonant) return word
    return word + langinfo.offsetToChar(0x3e, this.lang)
  }

  /**
   * For IE
   * - consonant ending: add halant
   * - halant ending: no change
   * - 'a' ki maatra: no change
   *
   * @param {string} word - A single word
   * @returns {string} The word, extended when it ends in a consonant
   */
  _normalizeWordVowelEndingIe (word) {
    const endsInConsonant =
      word.length > 0 &&
      langinfo.isConsonant(word.charAt(word.length - 1), this.lang)

    if (!endsInConsonant) return word
    return word + langinfo.offsetToChar(langinfo.HALANTA_OFFSET, this.lang)
  }

  /**
   * Apply the language's vowel-ending function to every space-separated word.
   *
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  _normalizeVowelEnding (text) {
    return text
      .split(' ')
      .map((w) => this.fnVowelEnding(w))
      .join(' ')
  }

  /**
   * Common normalization shared by all scripts; script specific subclasses
   * call this through super.normalize().
   *
   * All occurrences of each invisible character are handled. A string
   * pattern passed to String.prototype.replace only affects the first
   * occurrence, so replaceAll is used throughout.
   *
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  normalize (text) {
    text = text.replaceAll(NormalizerI.BYTE_ORDER_MARK, '')
    text = text.replaceAll(NormalizerI.BYTE_ORDER_MARK_2, '')
    text = text.replaceAll(NormalizerI.WORD_JOINER, '')
    text = text.replaceAll(NormalizerI.SOFT_HYPHEN, '')

    text = text.replaceAll(NormalizerI.ZERO_WIDTH_SPACE, ' ') // ??
    text = text.replaceAll(NormalizerI.NO_BREAK_SPACE, ' ')

    text = text.replaceAll(NormalizerI.ZERO_WIDTH_NON_JOINER, '')
    text = text.replaceAll(NormalizerI.ZERO_WIDTH_JOINER, '')

    text = this._normalizePunctuations(text)

    if (this.doNormalizeChandras) {
      text = this._normalizeChandras(text)
    }
    text = this._normalizeNasals(text)
    if (this.doNormalizeVowelEnding) {
      text = this._normalizeVowelEnding(text)
    }

    return text
  }

  /**
   * Log, one number per line, how often each invisible character handled by
   * normalize() occurs in the text. Order: byte order mark, reversed byte
   * order mark, word joiner, soft hyphen, zero width space, no-break space,
   * zero width non-joiner, zero width joiner.
   *
   * @param {string} text - Text to inspect
   */
  getCharStats (text) {
    const tracked = [
      NormalizerI.BYTE_ORDER_MARK,
      NormalizerI.BYTE_ORDER_MARK_2,
      NormalizerI.WORD_JOINER,
      NormalizerI.SOFT_HYPHEN,
      NormalizerI.ZERO_WIDTH_SPACE,
      NormalizerI.NO_BREAK_SPACE,
      NormalizerI.ZERO_WIDTH_NON_JOINER,
      NormalizerI.ZERO_WIDTH_JOINER
    ]

    for (const ch of tracked) {
      console.log(text.match(new RegExp(ch, 'g'))?.length || 0)
    }
  }

  /**
   * Replace a colon by the Devanagari visarga (U+0903) when the colon
   * directly follows a character of the Devanagari block (U+0900-U+097F).
   *
   * NOTE(review): visargaChar and charRange are accepted for interface
   * compatibility but are not used; the Devanagari values are hard-coded.
   *
   * @param {string} text - Text to correct
   * @param {string} visargaChar - Unused
   * @param {string} charRange - Unused
   * @returns {string} Corrected text
   */
  correctVisarga (text, visargaChar, charRange) {
    return text.replace(/([\u0900-\u097f]):/g, '$1\u0903')
  }
}
|
|
370
|
+
|
|
371
|
+
class DevanagariNormalizer extends BaseNormalizer {
  /**
   * Normalizer for the Devanagari script. In addition to basic normalization by the super class,
   * * Replaces the composite characters containing nuktas by their decomposed form
   * * replace pipe character '|' by poorna virama character
   * * replace colon ':' by visarga if the colon follows a character in this script
   */

  static NUKTA = '\u093C'

  /**
   * Precomposed nukta characters paired with the base character that,
   * followed by NUKTA, forms their decomposed spelling.
   */
  static #NUKTA_COMPOSITES = [
    ['\u0929', '\u0928'],
    ['\u0931', '\u0930'],
    ['\u0934', '\u0933'],
    ['\u0958', '\u0915'],
    ['\u0959', '\u0916'],
    ['\u095A', '\u0917'],
    ['\u095B', '\u091C'],
    ['\u095C', '\u0921'],
    ['\u095D', '\u0922'],
    ['\u095E', '\u092B'],
    ['\u095F', '\u092F']
  ]

  /**
   * Constructor for DevanagariNormalizer
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   */
  constructor (
    lang = 'hi',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
  }

  /**
   * Normalize Devanagari text. Every occurrence of each character is
   * handled, not just the first one in the text.
   *
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  normalize (text) {
    // Common normalization for Indic scripts
    text = super.normalize(text)

    // chandra a replacement for Marathi
    text = text.replaceAll('\u0972', '\u090f')

    // decomposing Nukta based composite characters
    for (const [composite, base] of DevanagariNormalizer.#NUKTA_COMPOSITES) {
      text = text.replaceAll(composite, base + DevanagariNormalizer.NUKTA)
    }

    if (this.removeNuktas) {
      text = text.replaceAll(DevanagariNormalizer.NUKTA, '')
    }

    // replace pipe character for poorna virama
    text = text.replaceAll('\u007c', '\u0964')

    // correct visarga: a colon directly after a character of the Devanagari
    // block (U+0900-U+097F) becomes the visarga sign
    text = text.replace(/([\u0900-\u097f]):/g, '$1\u0903')

    return text
  }

  /**
   * Log the counts printed by the super class, followed by one count per
   * precomposed nukta character (U+0929, U+0931, U+0934, U+0958-U+095F).
   *
   * @param {string} text - Text to inspect
   */
  getCharStats (text) {
    super.getCharStats(text)

    for (const [composite] of DevanagariNormalizer.#NUKTA_COMPOSITES) {
      console.log(text.match(new RegExp(composite, 'g'))?.length || 0)
    }
  }
}
|
|
454
|
+
|
|
455
|
+
class GurmukhiNormalizer extends BaseNormalizer {
  /**
   * Normalizer for the Gurmukhi script. In addition to basic normalization by the super class,
   * * Replaces the composite characters containing nuktas by their decomposed form
   * * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
   * * replace pipe character '|' by poorna virama character
   * * replace colon ':' by visarga if the colon follows a character in this script
   */

  static NUKTA = '\u0A3C'

  // Two-character vowel spellings mapped to the single independent vowel.
  // Keys are written as escapes so the combining marks stay visible.
  static VOWEL_NORM_MAPS = {
    // http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
    // Table 12-16
    '\u0a05\u0a3e': '\u0a06',
    '\u0a72\u0a3f': '\u0a07',
    '\u0a72\u0a40': '\u0a08',
    '\u0a73\u0a41': '\u0a09',
    '\u0a73\u0a42': '\u0a0a',
    '\u0a72\u0a47': '\u0a0f',
    '\u0a05\u0a48': '\u0a10',
    '\u0a73\u0a4b': '\u0a13',
    '\u0a05\u0a4c': '\u0a14'
  }

  /**
   * Precomposed nukta characters paired with the base character that,
   * followed by NUKTA, forms their decomposed spelling.
   */
  static #NUKTA_COMPOSITES = [
    ['\u0a33', '\u0a32'],
    ['\u0a36', '\u0a38'],
    ['\u0a59', '\u0a16'],
    ['\u0a5a', '\u0a17'],
    ['\u0a5b', '\u0a1c'],
    ['\u0a5e', '\u0a2b']
  ]

  /**
   * Constructor for GurmukhiNormalizer
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   * @param {boolean} doCanonicalizeAddak - Whether to canonicalize addak
   * @param {boolean} doCanonalizeTippi - Whether to canonicalize tippi
   * @param {boolean} doReplaceVowelBases - Whether to replace vowel bases
   */
  constructor (
    lang = 'pa',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false,
    doCanonicalizeAddak = false,
    doCanonalizeTippi = false,
    doReplaceVowelBases = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
    this.doCanonicalizeAddak = doCanonicalizeAddak
    this.doCanonalizeTippi = doCanonalizeTippi
    this.doReplaceVowelBases = doReplaceVowelBases
  }

  /**
   * Replace the vowel spellings listed in VOWEL_NORM_MAPS and, when
   * doReplaceVowelBases is set, the bare vowel bases U+0A72 and U+0A73.
   *
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  _normalizeVowels (text) {
    // standard vowel replacements as per suggestions in
    // http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
    // Table 12-16
    for (const [k, v] of Object.entries(GurmukhiNormalizer.VOWEL_NORM_MAPS)) {
      text = text.replaceAll(k, v)
    }

    // If these special characters occur without any diacritic, replace them with closest
    // equivalent vowels
    if (this.doReplaceVowelBases) {
      text = text.replaceAll('\u0a72', '\u0a07')
      text = text.replaceAll('\u0a73', '\u0a09')
    }

    return text
  }

  /**
   * Normalize Gurmukhi text. Every occurrence of each character is handled,
   * not just the first one in the text.
   *
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  normalize (text) {
    // Addak
    if (this.doCanonicalizeAddak) {
      // replace addak+consonant with consonant+halant+consonant
      text = text.replace(/\u0a71(.)/g, '$1\u0a4d$1')
    }

    // Tippi
    if (this.doCanonalizeTippi) {
      text = text.replaceAll('\u0a70', '\u0a02')
    }

    // Vowels: Gurmukhi has multiple ways of representing independent vowels due
    // to the characters 'iri' and 'ura'.
    text = this._normalizeVowels(text)

    // common normalization for Indic scripts
    text = super.normalize(text)

    // decomposing Nukta based composite characters
    for (const [composite, base] of GurmukhiNormalizer.#NUKTA_COMPOSITES) {
      text = text.replaceAll(composite, base + GurmukhiNormalizer.NUKTA)
    }

    if (this.removeNuktas) {
      text = text.replaceAll(GurmukhiNormalizer.NUKTA, '')
    }

    // replace the poorna virama codes specific to script
    // with generic Indic script codes
    text = text.replaceAll('\u0a64', '\u0964')
    text = text.replaceAll('\u0a65', '\u0965')

    // replace pipe character for poorna virama
    text = text.replaceAll('\u007c', '\u0964')

    // correct visarga: a colon directly after a character of the Gurmukhi
    // block (U+0A00-U+0A7F) becomes the visarga sign. The range is written
    // with escapes because its end points have no visible glyph.
    text = text.replace(/([\u0a00-\u0a7f]):/g, '$1\u0a03')

    return text
  }
}
|
|
576
|
+
|
|
577
|
+
class GujaratiNormalizer extends BaseNormalizer {
  /**
   * Normalizer for the Gujarati script. In addition to basic normalization by the super class,
   * * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
   * * replace colon ':' by visarga if the colon follows a character in this script
   */

  static NUKTA = '\u0ABC'

  /**
   * Constructor for GujaratiNormalizer. All arguments are forwarded unchanged
   * to the super class.
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   */
  constructor (
    lang = 'gu',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
  }

  /**
   * Normalize Gujarati text. Every substitution is applied to all
   * occurrences in the text.
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  normalize (text) {
    // common normalization for Indic scripts
    text = super.normalize(text)

    // drop nuktas when requested
    if (this.removeNuktas) {
      text = text.replaceAll(GujaratiNormalizer.NUKTA, '')
    }

    // replace the poorna virama codes specific to script
    // with generic Indic script codes
    // (replaceAll: String.replace with a string pattern only touches the first match)
    text = text.replaceAll('\u0ae4', '\u0964')
    text = text.replaceAll('\u0ae5', '\u0965')

    // correct visarga: a colon directly after any character of the Gujarati
    // block (U+0A80-U+0AFF) becomes the visarga sign
    text = text.replace(/([\u0a80-\u0aff]):/g, '$1\u0a83')

    return text
  }
}
|
|
630
|
+
|
|
631
|
+
class OriyaNormalizer extends BaseNormalizer {
  /**
   * Normalizer for the Oriya script. In addition to basic normalization by the super class,
   * * Replaces the composite characters containing nuktas by their decomposed form
   * * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
   * * Canonicalize two part dependent vowels
   * * Replace 'va' with 'ba'
   * * replace pipe character '|' by poorna virama character
   * * replace colon ':' by visarga if the colon follows a character in this script
   */

  static NUKTA = '\u0B3C'

  static VOWEL_NORM_MAPS = {
    // See Table 12-22 in http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
    ଅା: '\u0b06',
    ଏୗ: '\u0b10',
    ଓୗ: '\u0b14'
  }

  /**
   * Constructor for OriyaNormalizer
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   * @param {boolean} doRemapWa - Whether to remap wa
   */
  constructor (
    lang = 'or',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false,
    doRemapWa = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
    this.doRemapWa = doRemapWa
  }

  /**
   * Normalize Oriya text. Every substitution is applied to all occurrences
   * in the text.
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  normalize (text) {
    // common normalization for Indic scripts
    text = super.normalize(text)

    // standard vowel replacements as per suggestions in Unicode documents
    for (const [sequence, vowel] of Object.entries(OriyaNormalizer.VOWEL_NORM_MAPS)) {
      text = text.replaceAll(sequence, vowel)
    }

    // decomposing Nukta based composite characters
    // (replaceAll: String.replace with a string pattern only touches the first match)
    const nukta = OriyaNormalizer.NUKTA
    text = text.replaceAll('\u0b5c', '\u0b21' + nukta)
    text = text.replaceAll('\u0b5d', '\u0b22' + nukta)

    if (this.removeNuktas) {
      text = text.replaceAll(nukta, '')
    }

    // replace the poorna virama codes specific to script
    // with generic Indic script codes
    text = text.replaceAll('\u0b64', '\u0964')
    text = text.replaceAll('\u0b65', '\u0965')

    // replace pipe character for poorna virama
    // NOTE(review): U+0B7C is not the ASCII pipe (U+007C) that the class
    // description mentions; kept as-is, confirm which code point is intended.
    text = text.replaceAll('\u0b7c', '\u0964')

    // replace wa with ba
    if (this.doRemapWa) {
      text = text.replaceAll('\u0b71', '\u0b2c')
    }

    // replace va with ba
    // NOTE: documentation (chapter on Indic scripts) and codepoint chart seem contradictory
    // (this applied to wa to ba rule also above)
    text = text.replaceAll('\u0b35', '\u0b2c')

    // AI dependent vowel sign
    // NOTE(review): the precomposed AI vowel sign appears to be U+0B48;
    // confirm whether the U+0B58 target is intentional. Kept as-is.
    text = text.replaceAll('\u0b47\u0b56', '\u0b58')

    // two part dependent vowels
    text = text.replaceAll('\u0b47\u0b3e', '\u0b4b')
    text = text.replaceAll('\u0b47\u0b57', '\u0b4c')

    // additional consonant - not clear how to handle this
    // ignore

    // correct visarga: a colon directly after any character of the Oriya
    // block (U+0B00-U+0B7F) becomes the visarga sign
    text = text.replace(/([\u0b00-\u0b7f]):/g, '$1\u0b03')

    return text
  }
}
|
|
729
|
+
|
|
730
|
+
class BengaliNormalizer extends BaseNormalizer {
  /**
   * Normalizer for the Bengali script. In addition to basic normalization by the super class,
   * * Replaces the composite characters containing nuktas by their decomposed form
   * * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
   * * Canonicalize two part dependent vowels
   * * replace pipe character '|' by poorna virama character
   * * replace colon ':' by visarga if the colon follows a character in this script
   */

  static NUKTA = '\u09BC'

  /**
   * Constructor for BengaliNormalizer
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   * @param {boolean} doRemapAssameseChars - Whether to remap Assamese characters
   */
  constructor (
    lang = 'bn',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false,
    doRemapAssameseChars = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
    this.doRemapAssameseChars = doRemapAssameseChars
  }

  /**
   * Normalize Bengali (or Assamese) text. Every substitution is applied to
   * all occurrences in the text.
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  normalize (text) {
    // common normalization for Indic scripts
    text = super.normalize(text)

    // decomposing Nukta based composite characters
    // (replaceAll: String.replace with a string pattern only touches the first match)
    const nukta = BengaliNormalizer.NUKTA
    text = text.replaceAll('\u09dc', '\u09a1' + nukta)
    text = text.replaceAll('\u09dd', '\u09a2' + nukta)
    text = text.replaceAll('\u09df', '\u09af' + nukta)

    if (this.removeNuktas) {
      text = text.replaceAll(nukta, '')
    }

    // Assamese letters are remapped only when requested and only for lang 'as'
    if (this.doRemapAssameseChars && this.lang === 'as') {
      text = text.replaceAll('\u09f0', '\u09b0') // 'ra' character
      text = text.replaceAll('\u09f1', '\u09ac') // 'va' character
    }

    // replace the poorna virama codes specific to script
    // with generic Indic script codes
    text = text.replaceAll('\u09e4', '\u0964')
    text = text.replaceAll('\u09e5', '\u0965')

    // replace pipe character for poorna virama
    text = text.replaceAll('\u007c', '\u0964')
    // replace bengali currency numerator four for poorna virama (it looks similar and is used as a substitute)
    text = text.replaceAll('\u09f7', '\u0964')

    // two part dependent vowels
    text = text.replaceAll('\u09c7\u09be', '\u09cb')
    text = text.replaceAll('\u09c7\u09d7', '\u09cc')

    // correct visarga: a colon directly after any character of the Bengali
    // block (U+0980-U+09FF) becomes the visarga sign
    text = text.replace(/([\u0980-\u09ff]):/g, '$1\u0983')

    return text
  }
}
|
|
807
|
+
|
|
808
|
+
class TamilNormalizer extends BaseNormalizer {
  /**
   * Normalizer for the Tamil script. In addition to basic normalization by the super class,
   * * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
   * * canonicalize two-part dependent vowel signs
   * * replace colon ':' by visarga if the colon follows a character in this script
   */

  /**
   * Constructor for TamilNormalizer. All arguments are forwarded unchanged
   * to the super class.
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   */
  constructor (
    lang = 'ta',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
  }

  /**
   * Normalize Tamil text. Every substitution is applied to all occurrences
   * in the text.
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  normalize (text) {
    // common normalization for Indic scripts
    text = super.normalize(text)

    // replace the poorna virama codes specific to script
    // with generic Indic script codes
    // (replaceAll: String.replace with a string pattern only touches the first match)
    text = text.replaceAll('\u0be4', '\u0964')
    text = text.replaceAll('\u0be5', '\u0965')

    // two part dependent vowels
    text = text.replaceAll('\u0b92\u0bd7', '\u0b94')
    text = text.replaceAll('\u0bc6\u0bbe', '\u0bca')
    text = text.replaceAll('\u0bc7\u0bbe', '\u0bcb')
    text = text.replaceAll('\u0bc6\u0bd7', '\u0bcc')

    // correct visarga: a colon directly after any character of the Tamil
    // block (U+0B80-U+0BFF) becomes the visarga sign
    text = text.replace(/([\u0b80-\u0bff]):/g, '$1\u0b83')

    return text
  }
}
|
|
861
|
+
|
|
862
|
+
class TeluguNormalizer extends BaseNormalizer {
  /**
   * Normalizer for the Telugu script. In addition to basic normalization by the super class,
   * * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
   * * canonicalize two-part dependent vowel signs
   * * replace colon ':' by visarga if the colon follows a character in this script
   */

  /**
   * Constructor for TeluguNormalizer. All arguments are forwarded unchanged
   * to the super class.
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   */
  constructor (
    lang = 'te',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
  }

  /**
   * Normalize Telugu text. Every substitution is applied to all occurrences
   * in the text.
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  normalize (text) {
    // common normalization for Indic scripts
    text = super.normalize(text)

    // replace the poorna virama codes specific to script
    // with generic Indic script codes
    // (replaceAll: String.replace with a string pattern only touches the first match)
    text = text.replaceAll('\u0c64', '\u0964')
    text = text.replaceAll('\u0c65', '\u0965')

    // dependent vowels
    text = text.replaceAll('\u0c46\u0c56', '\u0c48')

    // correct visarga: a colon directly after any character of the Telugu
    // block (U+0C00-U+0C7F) becomes the visarga sign
    text = text.replace(/([\u0c00-\u0c7f]):/g, '$1\u0c03')

    return text
  }

  /**
   * Character statistics hook; intentionally left unimplemented, so it
   * returns undefined.
   * @param {string} text - Text to inspect (unused)
   */
  getCharStats (text) {
    // Empty implementation
  }
}
|
|
916
|
+
|
|
917
|
+
class KannadaNormalizer extends BaseNormalizer {
  /**
   * Normalizer for the Kannada script. In addition to basic normalization by the super class,
   * * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
   * * canonicalize two-part dependent vowel signs
   * * replace colon ':' by visarga if the colon follows a character in this script
   */

  /**
   * Constructor for KannadaNormalizer. All arguments are forwarded unchanged
   * to the super class.
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   */
  constructor (
    lang = 'kn',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
  }

  /**
   * Normalize Kannada text. Every substitution is applied to all occurrences
   * in the text.
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  normalize (text) {
    // common normalization for Indic scripts
    text = super.normalize(text)

    // replace the poorna virama codes specific to script
    // with generic Indic script codes
    // (replaceAll: String.replace with a string pattern only touches the first match)
    text = text.replaceAll('\u0ce4', '\u0964')
    text = text.replaceAll('\u0ce5', '\u0965')

    // dependent vowels; order matters because the U+0CCA produced by the
    // fourth rule is an input of the fifth
    text = text.replaceAll('\u0cbf\u0cd5', '\u0cc0')
    text = text.replaceAll('\u0cc6\u0cd5', '\u0cc7')
    text = text.replaceAll('\u0cc6\u0cd6', '\u0cc8')
    text = text.replaceAll('\u0cc6\u0cc2', '\u0cca')
    text = text.replaceAll('\u0cca\u0cd5', '\u0ccb')

    // correct visarga: a colon directly after any character of the Kannada
    // block (U+0C80-U+0CFF) becomes the visarga sign
    text = text.replace(/([\u0c80-\u0cff]):/g, '$1\u0c83')

    return text
  }
}
|
|
971
|
+
|
|
972
|
+
class MalayalamNormalizer extends BaseNormalizer {
  /**
   * Normalizer for the Malayalam script. In addition to basic normalization by the super class,
   * * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
   * * canonicalize two-part dependent vowel signs
   * * Change from old encoding of chillus (till Unicode 5.0) to new encoding
   * * replace colon ':' by visarga if the colon follows a character in this script
   */

  // Atomic chillu character -> base consonant it stands for
  static CHILLU_CHAR_MAP = {
    ൺ: '\u0d23',
    ൻ: '\u0d28',
    ർ: '\u0d30',
    ൽ: '\u0d32',
    ൾ: '\u0d33',
    ൿ: '\u0d15'
  }

  /**
   * Rewrite every atomic chillu as its base consonant followed by the
   * virama (U+0D4D).
   * @param {string} text - Text to process
   * @returns {string} Text with chillus expanded
   */
  _canonicalizeChillus (text) {
    for (const [chillu, consonant] of Object.entries(
      MalayalamNormalizer.CHILLU_CHAR_MAP
    )) {
      text = text.replaceAll(chillu, `${consonant}\u0d4d`)
    }
    return text
  }

  /**
   * Replace every U+0D31 U+0D4D U+0D31 sequence with U+0D1F U+0D4D U+0D1F.
   * @param {string} text - Text to process
   * @returns {string} Text with the geminated T corrected
   */
  _correctGeminatedT (text) {
    // replaceAll: String.replace with a string pattern only touches the first match
    return text.replaceAll('\u0d31\u0d4d\u0d31', '\u0d1f\u0d4d\u0d1f')
  }

  /**
   * Constructor for MalayalamNormalizer
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   * @param {string} nasalsMode - How to handle nasal characters
   * @param {boolean} doNormalizeChandras - Whether to normalize chandra characters
   * @param {boolean} doNormalizeVowelEnding - Whether to normalize vowel endings
   * @param {boolean} doCanonicalizeChillus - Whether to canonicalize chillus
   * @param {boolean} doCorrectGeminatedT - Whether to correct geminated T
   */
  constructor (
    lang = 'ml',
    removeNuktas = false,
    nasalsMode = 'do_nothing',
    doNormalizeChandras = false,
    doNormalizeVowelEnding = false,
    doCanonicalizeChillus = false,
    doCorrectGeminatedT = false
  ) {
    super(
      lang,
      removeNuktas,
      nasalsMode,
      doNormalizeChandras,
      doNormalizeVowelEnding
    )
    this.doCanonicalizeChillus = doCanonicalizeChillus
    this.doCorrectGeminatedT = doCorrectGeminatedT
  }

  /**
   * Normalize Malayalam text. Every substitution is applied to all
   * occurrences in the text.
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  normalize (text) {
    // Change from old encoding of chillus (till Unicode 5.0) to new encoding:
    // consonant + virama + ZWJ becomes the atomic chillu character
    const legacyChillus = [
      ['\u0d23', '\u0d7a'],
      ['\u0d28', '\u0d7b'],
      ['\u0d30', '\u0d7c'],
      ['\u0d32', '\u0d7d'],
      ['\u0d33', '\u0d7e'],
      ['\u0d15', '\u0d7f']
    ]
    for (const [consonant, chillu] of legacyChillus) {
      text = text.replaceAll(`${consonant}\u0d4d\u200d`, chillu)
    }

    // Normalize chillus
    if (this.doCanonicalizeChillus) {
      text = this._canonicalizeChillus(text)
    }

    // common normalization for Indic scripts
    text = super.normalize(text)

    // replace the poorna virama codes specific to script
    // with generic Indic script codes
    text = text.replaceAll('\u0d64', '\u0964')
    text = text.replaceAll('\u0d65', '\u0965')

    // dependent vowels
    text = text.replaceAll('\u0d46\u0d3e', '\u0d4a')
    text = text.replaceAll('\u0d47\u0d3e', '\u0d4b')

    // au forms: the two-part form first, then any remaining lone length mark
    text = text.replaceAll('\u0d46\u0d57', '\u0d4c')
    text = text.replaceAll('\u0d57', '\u0d4c')

    // correct geminated T
    if (this.doCorrectGeminatedT) {
      text = this._correctGeminatedT(text)
    }

    // correct visarga: a colon directly after any character of the Malayalam
    // block (U+0D00-U+0D7F) becomes the visarga sign
    text = text.replace(/([\u0d00-\u0d7f]):/g, '$1\u0d03')

    return text
  }
}
|
|
1074
|
+
|
|
1075
|
+
class UrduNormalizer extends NormalizerI {
  /**
   * Urdu normalizer modelled on the UrduHack library.
   * https://docs.urduhack.com/en/stable/_modules/urduhack/normalization/character.html#normalize
   *
   * Only the whitespace and spacing helpers are implemented; diacritic
   * removal and character normalization are identity placeholders, and a
   * warning is logged on every construction to say so.
   *
   * @param {string} lang - Language code
   * @param {boolean} removeNuktas - Whether to remove nukta characters
   */
  constructor (lang, removeNuktas = true) {
    super()
    this.lang = lang
    this.removeNuktas = removeNuktas

    // These are stand-ins for the functionality that would come from urduhack.
    // Collapse every run of whitespace into one space.
    this.normalizeWhitespace = (text) => text.replace(/\s+/g, ' ')

    // Put a space between an ASCII digit and an adjacent non-digit.
    this.digitsSpace = (text) =>
      text
        .replace(/(\d)([^\d\s])/g, '$1 $2')
        .replace(/([^\d\s])(\d)/g, '$1 $2')

    // Put a space around punctuation. `\w` is ASCII-only in JavaScript, so it
    // would classify every Urdu letter as punctuation and split words into
    // single characters. Word characters are therefore spelled out with
    // Unicode properties (letters, marks, numbers), underscore and the
    // zero-width (non-)joiners that occur inside words.
    this.allPunctuationsSpace = (text) =>
      text
        .replace(/([^\p{L}\p{M}\p{N}_\u200c\u200d\s])(\S)/gu, '$1 $2')
        .replace(/(\S)([^\p{L}\p{M}\p{N}_\u200c\u200d\s])/gu, '$1 $2')

    // Put a space between an ASCII letter and an adjacent non-ASCII-letter.
    this.englishCharactersSpace = (text) =>
      text
        .replace(/([a-zA-Z])([^a-zA-Z\s])/g, '$1 $2')
        .replace(/([^a-zA-Z\s])([a-zA-Z])/g, '$1 $2')

    this.removeDiacritics = (text) => text // Placeholder
    this.normalizeCharacters = (text) => text // Placeholder
    this.normalizeCombineCharacters = (text) => text // Placeholder

    console.warn(
      'Warning: UrduNormalizer is using placeholder implementations. For full functionality, equivalent JavaScript implementations of urduhack functions are needed.'
    )
  }

  /**
   * Normalize Urdu text by running the helpers in a fixed order:
   * punctuation normalization, whitespace collapsing, optional diacritic
   * removal, character normalization, then the three spacing passes.
   * @param {string} text - Text to normalize
   * @returns {string} Normalized text
   */
  normalize (text) {
    text = this._normalizePunctuations(text)
    text = this.normalizeWhitespace(text)
    if (this.removeNuktas) {
      text = this.removeDiacritics(text)
    }
    text = this.normalizeCharacters(text)
    text = this.normalizeCombineCharacters(text)
    text = this.digitsSpace(text)
    text = this.allPunctuationsSpace(text)
    text = this.englishCharactersSpace(text)
    return text
  }
}
|
|
1129
|
+
|
|
1130
|
+
class IndicNormalizerFactory {
  /**
   * Factory class to create language specific normalizers.
   */

  /**
   * Get the language specific normalizer.
   *
   * The normalizer constructors take positional arguments, so the named
   * entries of `options` are unpacked into those positions. Handing the
   * object itself over as the second argument would put it in the
   * `removeNuktas` slot, where any object is truthy. An option that is left
   * out is passed as `undefined`, which selects the constructor's default.
   *
   * Recognized option names: removeNuktas, nasalsMode, doNormalizeChandras,
   * doNormalizeVowelEnding, plus per script doCanonicalizeAddak,
   * doCanonicalizeTippi (or doCanonalizeTippi), doReplaceVowelBases ('pa'),
   * doRemapAssameseChars ('bn', 'as'), doRemapWa ('or'),
   * doCanonicalizeChillus and doCorrectGeminatedT ('ml').
   *
   * @param {string} language - Language code
   * @param {Object} options - Options for normalizer
   * @returns {NormalizerI} - Language specific normalizer
   */
  static getNormalizer (language, options = {}) {
    // A non-object second argument (e.g. a bare boolean) keeps its previous
    // meaning: it is used as removeNuktas.
    const opts =
      options !== null && typeof options === 'object'
        ? options
        : { removeNuktas: options }

    // Leading positional arguments shared by the script normalizers
    const common = [
      language,
      opts.removeNuktas,
      opts.nasalsMode,
      opts.doNormalizeChandras,
      opts.doNormalizeVowelEnding
    ]

    switch (language) {
      case 'hi':
      case 'mr':
      case 'sa':
      case 'kK':
      case 'ne':
      case 'sd':
        // NOTE(review): assumes DevanagariNormalizer takes the same five
        // leading positional parameters as the other normalizers - confirm.
        return new DevanagariNormalizer(...common)
      case 'ur':
        return new UrduNormalizer(language, opts.removeNuktas)
      case 'pa':
        // NOTE(review): assumes the Gurmukhi constructor's extra parameters
        // are (doCanonicalizeAddak, doCanonalizeTippi, doReplaceVowelBases)
        // in this order - confirm against its definition.
        return new GurmukhiNormalizer(
          ...common,
          opts.doCanonicalizeAddak,
          opts.doCanonicalizeTippi ?? opts.doCanonalizeTippi,
          opts.doReplaceVowelBases
        )
      case 'gu':
        return new GujaratiNormalizer(...common)
      case 'bn':
      case 'as':
        return new BengaliNormalizer(...common, opts.doRemapAssameseChars)
      case 'or':
        return new OriyaNormalizer(...common, opts.doRemapWa)
      case 'ml':
        return new MalayalamNormalizer(
          ...common,
          opts.doCanonicalizeChillus,
          opts.doCorrectGeminatedT
        )
      case 'kn':
        return new KannadaNormalizer(...common)
      case 'ta':
        return new TamilNormalizer(...common)
      case 'te':
        return new TeluguNormalizer(...common)
      default:
        // Unknown languages fall back to the script-independent normalizer
        return new BaseNormalizer(...common)
    }
  }

  /**
   * Check if a language is supported, i.e. has a dedicated normalizer
   * @param {string} language - Language code
   * @returns {boolean} - Whether the language is supported
   */
  static isLanguageSupported (language) {
    const supported = [
      'hi', 'mr', 'sa', 'kK', 'ne', 'sd',
      'ur', 'pa', 'gu', 'bn', 'as', 'or',
      'ml', 'kn', 'ta', 'te'
    ]
    return supported.includes(language)
  }
}
|
|
1198
|
+
|
|
1199
|
+
module.exports = {
|
|
1200
|
+
NormalizerI,
|
|
1201
|
+
BaseNormalizer,
|
|
1202
|
+
DevanagariNormalizer,
|
|
1203
|
+
GurmukhiNormalizer,
|
|
1204
|
+
GujaratiNormalizer,
|
|
1205
|
+
OriyaNormalizer,
|
|
1206
|
+
BengaliNormalizer,
|
|
1207
|
+
TamilNormalizer,
|
|
1208
|
+
TeluguNormalizer,
|
|
1209
|
+
KannadaNormalizer,
|
|
1210
|
+
MalayalamNormalizer,
|
|
1211
|
+
UrduNormalizer,
|
|
1212
|
+
IndicNormalizerFactory
|
|
1213
|
+
}
|