@qvac/translation-nmtcpp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. package/LICENSE +202 -0
  2. package/README.md +470 -0
  3. package/binding.js +1 -0
  4. package/index.d.ts +82 -0
  5. package/index.js +188 -0
  6. package/lib/error.js +65 -0
  7. package/marian.js +186 -0
  8. package/package.json +69 -0
  9. package/prebuilds/android-arm/qvac__translation-nmtcpp.bare +0 -0
  10. package/prebuilds/android-arm64/qvac__translation-nmtcpp.bare +0 -0
  11. package/prebuilds/android-ia32/qvac__translation-nmtcpp.bare +0 -0
  12. package/prebuilds/android-x64/qvac__translation-nmtcpp.bare +0 -0
  13. package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare +0 -0
  14. package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare.exports +3622 -0
  15. package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare +0 -0
  16. package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare.exports +3731 -0
  17. package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare +0 -0
  18. package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare.exports +3603 -0
  19. package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare +0 -0
  20. package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare.exports +3603 -0
  21. package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare +0 -0
  22. package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare.exports +3720 -0
  23. package/prebuilds/linux-x64/qvac__translation-nmtcpp.bare +0 -0
  24. package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare +0 -0
  25. package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare.exports +0 -0
  26. package/third-party/indic-processor-deps/indicnlp/INDIC_NLP_LICENCE +9 -0
  27. package/third-party/indic-processor-deps/indicnlp/index.js +11 -0
  28. package/third-party/indic-processor-deps/indicnlp/indic_detokenize.js +141 -0
  29. package/third-party/indic-processor-deps/indicnlp/indic_normalize.js +1213 -0
  30. package/third-party/indic-processor-deps/indicnlp/indic_tokenize.js +123 -0
  31. package/third-party/indic-processor-deps/indicnlp/langinfo.js +609 -0
  32. package/third-party/indic-processor-deps/indicnlp/sinhala_transliterator.js +197 -0
  33. package/third-party/indic-processor-deps/indicnlp/unicode_transliterator.js +120 -0
  34. package/third-party/indic-processor-deps/sacremoses/SACREMOSES_LICENCE +21 -0
  35. package/third-party/indic-processor-deps/sacremoses/cjk.js +202 -0
  36. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/README.txt +8 -0
  37. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.as +65 -0
  38. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.bn +65 -0
  39. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ca +75 -0
  40. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.cs +390 -0
  41. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.de +325 -0
  42. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.el +1568 -0
  43. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.en +123 -0
  44. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.es +118 -0
  45. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.et +138 -0
  46. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fi +138 -0
  47. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fr +153 -0
  48. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ga +48 -0
  49. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.gu +105 -0
  50. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hi +113 -0
  51. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hu +103 -0
  52. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.is +251 -0
  53. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.it +180 -0
  54. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.kn +70 -0
  55. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lt +698 -0
  56. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lv +100 -0
  57. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ml +67 -0
  58. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mni +65 -0
  59. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mr +113 -0
  60. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.nl +115 -0
  61. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.or +101 -0
  62. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pa +102 -0
  63. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pl +283 -0
  64. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pt +210 -0
  65. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ro +38 -0
  66. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ru +293 -0
  67. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sk +474 -0
  68. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sl +78 -0
  69. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sv +97 -0
  70. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ta +71 -0
  71. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.tdt +210 -0
  72. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.te +70 -0
  73. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.yue +53 -0
  74. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.zh +53 -0
  75. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJK.txt +23246 -0
  76. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJKSymbols.txt +1 -0
  77. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Close_Punctuation.txt +1 -0
  78. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Currency_Symbol.txt +1 -0
  79. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Han.txt +1 -0
  80. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul.txt +1 -0
  81. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul_Syllables.txt +1 -0
  82. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hiragana.txt +1 -0
  83. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum-unichars-au.txt +1 -0
  84. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum.txt +1 -0
  85. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha-unichars-au.txt +1 -0
  86. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha.txt +1 -0
  87. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsLower.txt +1 -0
  88. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsN.txt +1 -0
  89. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPf.txt +1 -0
  90. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPi.txt +1 -0
  91. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSc.txt +1 -0
  92. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSo.txt +1 -0
  93. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsUpper.txt +1 -0
  94. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Katakana.txt +1 -0
  95. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Line_Separator.txt +1 -0
  96. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Lowercase_Letter.txt +1 -0
  97. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Number.txt +1 -0
  98. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Open_Punctuation.txt +1 -0
  99. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Punctuation.txt +1 -0
  100. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Separator.txt +1 -0
  101. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Symbol.txt +1 -0
  102. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Titlecase_Letter.txt +1 -0
  103. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Uppercase_Letter.txt +1 -0
  104. package/third-party/indic-processor-deps/sacremoses/index.js +8 -0
  105. package/third-party/indic-processor-deps/sacremoses/indic.js +76 -0
  106. package/third-party/indic-processor-deps/sacremoses/normalizer.js +264 -0
  107. package/third-party/indic-processor-deps/sacremoses/pernuliprops.js +287 -0
  108. package/third-party/indic-processor-deps/sacremoses/tokenizer.js +1217 -0
  109. package/third-party/indic-processor.js +565 -0
@@ -0,0 +1,565 @@
1
+ const {
2
+ MosesDetokenizer,
3
+ MosesPunctNormalizer,
4
+ MosesTokenizer
5
+ } = require('./indic-processor-deps/sacremoses')
6
+
7
+ const {
8
+ UnicodeIndicTransliterator,
9
+ IndicNormalizerFactory,
10
+ IndicTokenize,
11
+ IndicDetokenize
12
+ } = require('./indic-processor-deps/indicnlp')
13
+
14
/**
 * JavaScript port of the IndicProcessor class.
 *
 * Prepares sentences for an Indic translation model (punctuation and digit
 * normalization, placeholder wrapping, tokenization, transliteration to
 * Devanagari) and reverses those steps on the model output.
 *
 * Preprocessing in inference mode queues one placeholder map per sentence;
 * postprocessing consumes those maps in the same order (first in, first out),
 * so sentences must be postprocessed in the order they were preprocessed.
 */
class IndicProcessor {
  /**
   * Build the lookup tables, tokenizer/normalizer handles and regular
   * expressions used by the pre- and postprocessing steps.
   * @param {boolean} inference - When true (default), emails, URLs, numerals
   *   and #/@ tags are replaced by `<IDn>` placeholders during preprocessing.
   */
  constructor (inference = true) {
    this.inference = inference

    // FLORES language tag -> language code handed to the tokenizer,
    // normalizer and transliterator. Tags missing here fall back to 'hi'.
    this._floresCodes = {
      asm_Beng: 'as',
      awa_Deva: 'hi',
      ben_Beng: 'bn',
      bho_Deva: 'hi',
      brx_Deva: 'hi',
      doi_Deva: 'hi',
      eng_Latn: 'en',
      gom_Deva: 'kK',
      gon_Deva: 'hi',
      guj_Gujr: 'gu',
      hin_Deva: 'hi',
      hne_Deva: 'hi',
      kan_Knda: 'kn',
      kas_Arab: 'ur',
      kas_Deva: 'hi',
      kha_Latn: 'en',
      lus_Latn: 'en',
      mag_Deva: 'hi',
      mai_Deva: 'hi',
      mal_Mlym: 'ml',
      mar_Deva: 'mr',
      mni_Beng: 'bn',
      mni_Mtei: 'hi',
      npi_Deva: 'ne',
      ory_Orya: 'or',
      pan_Guru: 'pa',
      san_Deva: 'hi',
      sat_Olck: 'or',
      snd_Arab: 'ur',
      snd_Deva: 'hi',
      tam_Taml: 'ta',
      tel_Telu: 'te',
      urd_Arab: 'ur',
      unr_Deva: 'hi'
    }

    // Native digit -> ASCII digit. Each entry below is the code point of a
    // script's digit zero; digits one to nine follow it contiguously.
    this._digitsTranslationMap = new Map()
    const digitBlockStarts = [
      0x0966, // Devanagari
      0x09e6, // Bengali
      0x0a66, // Gurmukhi
      0x0ae6, // Gujarati
      0x0b66, // Oriya
      0x0c66, // Telugu (zero was missing from the hand-written table)
      0x0ce6, // Kannada
      0x1c50, // Ol Chiki
      0xabf0, // Meetei Mayek
      0x06f0, // Extended Arabic-Indic
      0x0030 // ASCII digits map to themselves
    ]
    for (const start of digitBlockStarts) {
      for (let digit = 0; digit <= 9; digit++) {
        this._digitsTranslationMap.set(
          String.fromCharCode(start + digit),
          String(digit)
        )
      }
    }
    // The original table also mapped the Arabic-Indic zero (and only the
    // zero of that block); kept so existing inputs behave the same.
    this._digitsTranslationMap.set('\u0660', '0')

    // FIFO queue of placeholder maps: _wrapWithPlaceholders pushes one map
    // per sentence, _postprocess shifts one map per sentence.
    this._placeholderEntityMaps = []

    // English tooling (sacremoses port) and Indic tooling (indicnlp port).
    this._enTok = new MosesTokenizer('en')
    this._enNormalizer = new MosesPunctNormalizer('en')
    this._enDetok = new MosesDetokenizer('en')
    this._xliterator = UnicodeIndicTransliterator
    this._indicTokenize = IndicTokenize
    this._indicDetokenize = IndicDetokenize
    this._indicNormalizerFactory = IndicNormalizerFactory

    // Precompiled patterns applied by _puncNorm after the replacement table.
    this._MULTISPACE_REGEX = /[ ]{2,}/g
    this._DIGIT_SPACE_PERCENT = /(\d) %/g
    this._DOUBLE_QUOT_PUNC = /"([,.]+)/g
    // Only a no-break space between digits becomes a dot; a plain space
    // must stay untouched, otherwise "10 20" would turn into "10.20".
    this._DIGIT_NBSP_DIGIT = /(\d)\u00a0(\d)/g
    this._END_BRACKET_SPACE_PUNC_REGEX = /\) ([.!:?;,])/g

    // Entity patterns wrapped into placeholders. The URL path part is a
    // single character-class repetition: a nested `(?:[...]+)+` accepts the
    // same strings but backtracks exponentially on input such as "a.%%%%%".
    this._URL_PATTERN =
      /\b(?<![\w/.])(?:(?:https?|ftp):\/\/)?(?:(?:[\w-]+\.)+(?!\.))[\w/\-?#&=%.]+(?!\.\w+)\b/g
    this._NUMERAL_PATTERN =
      /(~?\d+\.?\d*\s?%?\s?-?\s?~?\d+\.?\d*\s?%|~?\d+%|\d+[-/.,:']\d+[-/.,:'+]\d+(?:\.\d+)?|\d+[-/.:'+]\d+(?:\.\d+)?)/g
    this._EMAIL_PATTERN = /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}/g
    this._OTHER_PATTERN = /[A-Za-z0-9]*[#|@]\w+/g

    // Punctuation normalization table, applied in order. Non-ASCII
    // characters are written as escapes so they survive re-encoding:
    // U+00B4 acute, U+2018/U+2019/U+201A single quotes, U+201C/U+201D/U+201E
    // double quotes, U+00AB/U+00BB guillemets, U+2013/U+2014 dashes,
    // U+2026 ellipsis, U+00BA ordinal indicator, U+00A0 no-break space.
    // NOTE(review): the curly-quote, ellipsis and no-break-space forms are
    // assumed to be what this table originally targeted - confirm against
    // the reference implementation. Plain-space forms are still accepted.
    this._PUNC_REPLACEMENTS = [
      [/\r/g, ''],
      [/\(\s*/g, '('],
      [/\s*\)/g, ')'],
      [/\s:\s?/g, ':'],
      [/\s;\s?/g, ';'],
      [/[`\u00b4\u2018\u201a\u2019']/g, "'"],
      [/[\u201e\u201c\u201d"\u00ab\u00bb]/g, '"'],
      [/[\u2013\u2014]/g, '-'],
      [/\u2026/g, '...'],
      [/[ \u00a0]%/g, '%'],
      [/n\u00ba[ \u00a0]/g, 'n\u00ba '],
      [/[ \u00a0]\u00baC/g, ' \u00baC'],
      // A replacer function receives the matched string itself, so trim the
      // whole match; indexing it with [0] would keep only the leading space
      // and delete the punctuation mark.
      [/[ \u00a0][?!;]/g, (m) => m.trim()],
      [/,[ \u00a0]/g, ', ']
    ]

    // Spellings of "ID" that may come back in place of the Latin "ID"
    // inside a placeholder; each is registered as an alias when wrapping.
    this._INDIC_FAILURE_CASES = [
      'آی ڈی ',
      'ꯑꯥꯏꯗꯤ',
      'आईडी',
      'आई . डी . ',
      'आई . डी .',
      'आई. डी. ',
      'आई. डी.',
      'आय. डी. ',
      'आय. डी.',
      'आय . डी . ',
      'आय . डी .',
      'ऐटि',
      'آئی ڈی ',
      'ᱟᱭᱰᱤ ᱾',
      'आयडी',
      'ऐडि',
      'आइडि',
      'ᱟᱭᱰᱤ'
    ]
  }

  /**
   * Replace every occurrence of a literal substring.
   * Unlike `String.prototype.replace` with a string pattern, this replaces
   * all occurrences and never interprets `$` sequences in the replacement.
   * @private
   * @param {string} text - Text to search
   * @param {string} search - Literal substring to look for
   * @param {string} replacement - Literal text to insert
   * @returns {string} - Text with all occurrences replaced
   */
  _replaceAll (text, search, replacement) {
    if (search === '') {
      return text
    }
    return text.split(search).join(replacement)
  }

  /**
   * Apply punctuation replacements to text
   * @private
   * @param {string} text - Text to process
   * @param {Array} replacements - Array of [pattern, replacement] pairs,
   *   applied in order; a replacement is a string or a replacer function
   * @returns {string} - Processed text
   */
  _applyPuncReplacements (text, replacements) {
    return replacements.reduce(
      (current, [pattern, replacement]) => current.replace(pattern, replacement),
      text
    )
  }

  /**
   * Normalize punctuation in text
   * @private
   * @param {string} text - Text to normalize
   * @returns {string} - Normalized, trimmed text
   */
  _puncNorm (text) {
    // 1) Table-driven replacements
    text = this._applyPuncReplacements(text, this._PUNC_REPLACEMENTS)

    // 2) Spacing fixes that depend on the result of step 1
    text = text.replace(this._MULTISPACE_REGEX, ' ')
    text = text.replace(this._END_BRACKET_SPACE_PUNC_REGEX, ')$1')
    text = text.replace(this._DIGIT_SPACE_PERCENT, '$1%')
    text = text.replace(this._DOUBLE_QUOT_PUNC, '$1"')
    text = text.replace(this._DIGIT_NBSP_DIGIT, '$1.$2')
    return text.trim()
  }

  /**
   * Replace emails, URLs, numerals and #/@ tags with `<IDn>` placeholders.
   *
   * Pushes one map (placeholder spelling -> original text) onto
   * `_placeholderEntityMaps`, even when nothing matched, so the queue stays
   * aligned with the sentences.
   * @private
   * @param {string} text - Text to process
   * @returns {string} - Text with placeholders
   */
  _wrapWithPlaceholders (text) {
    let serialNo = 1
    const placeholderEntityMap = {}
    const patterns = [
      this._EMAIL_PATTERN,
      this._URL_PATTERN,
      this._NUMERAL_PATTERN,
      this._OTHER_PATTERN
    ]

    for (const pattern of patterns) {
      // The patterns are shared /g regexes; make sure the scan starts at 0.
      pattern.lastIndex = 0

      // Distinct matches in the current text, which already contains the
      // placeholders inserted for the previous patterns.
      const matches = new Set(text.match(pattern) || [])

      for (const match of matches) {
        // Skip URL/numeral candidates with fewer than 4 significant chars.
        if (
          pattern === this._URL_PATTERN &&
          match.replace(/\./g, '').length < 4
        ) {
          continue
        }
        if (
          pattern === this._NUMERAL_PATTERN &&
          match.replace(/[\s.:]/g, '').length < 4
        ) {
          continue
        }

        const basePlaceholder = `<ID${serialNo}>`

        // Bracket/spacing variants of the placeholder all map back to the
        // same original text.
        const aliases = [
          basePlaceholder,
          `< ID${serialNo} >`,
          `[ID${serialNo}]`,
          `[ ID${serialNo} ]`,
          `[ID ${serialNo}]`,
          `<ID${serialNo}]`,
          `< ID${serialNo}]`,
          `<ID${serialNo} ]`
        ]

        // Variants where "ID" itself was rewritten in an Indic script.
        for (const indicCase of this._INDIC_FAILURE_CASES) {
          aliases.push(
            `<${indicCase}${serialNo}>`,
            `< ${indicCase}${serialNo} >`,
            `< ${indicCase} ${serialNo} >`,
            `<${indicCase} ${serialNo}]`,
            `< ${indicCase} ${serialNo} ]`,
            `[${indicCase}${serialNo}]`,
            `[${indicCase} ${serialNo}]`,
            `[ ${indicCase}${serialNo} ]`,
            `[ ${indicCase} ${serialNo} ]`,
            `${indicCase} ${serialNo}`,
            `${indicCase}${serialNo}`
          )
        }

        for (const alias of aliases) {
          placeholderEntityMap[alias] = match
        }

        // `matches` is de-duplicated, so every occurrence of this entity
        // has to be replaced here, not just the first one.
        text = this._replaceAll(text, match, basePlaceholder)
        serialNo += 1
      }
    }

    // Collapse whitespace and drop a slash left directly after a placeholder.
    text = text.replace(/\s+/g, ' ')
    text = this._replaceAll(text, '>/', '>')
    text = this._replaceAll(text, ']/', ']')

    this._placeholderEntityMaps.push(placeholderEntityMap)
    return text
  }

  /**
   * Translate native digits to ASCII digits and, in inference mode, wrap
   * entities with placeholders.
   * @private
   * @param {string} text - Text to normalize
   * @returns {string} - Normalized text
   */
  _normalize (text) {
    let normalizedText = ''
    for (const char of text) {
      normalizedText += this._digitsTranslationMap.get(char) || char
    }

    return this.inference
      ? this._wrapWithPlaceholders(normalizedText)
      : normalizedText
  }

  /**
   * Normalize and tokenize an Indic sentence, optionally transliterating it
   * from `isoLang` to 'hi'.
   * @private
   * @param {string} sentence - Input sentence
   * @param {Object} normalizer - Object exposing `normalize(text)`
   * @param {string} isoLang - Language code from `_floresCodes`
   * @param {boolean} transliterate - Whether to transliterate
   * @returns {string} - Space-joined tokens, transliterated if requested
   */
  _doIndicTokenizeAndTransliterate (
    sentence,
    normalizer,
    isoLang,
    transliterate
  ) {
    const normed = normalizer.normalize(sentence.trim())
    const joined = this._indicTokenize.trivialTokenize(normed, isoLang).join(' ')

    if (!transliterate) {
      return joined
    }

    const xlated = this._xliterator.transliterate(joined, isoLang, 'hi')
    // Re-attach every virama that tokenization separated from its
    // neighbours, not only the first one.
    return this._replaceAll(xlated, ' ् ', '्')
  }

  /**
   * Preprocess a single sentence
   * @private
   * @param {string} sent - Input sentence
   * @param {string} srcLang - Source language tag (e.g. 'hin_Deva')
   * @param {string} tgtLang - Target language tag
   * @param {Object} normalizer - Language normalizer (unused on the English path)
   * @param {boolean} isTarget - Whether this is a target sentence
   * @returns {string} - Preprocessed sentence; source sentences are prefixed
   *   with "<srcLang> <tgtLang> "
   */
  _preprocess (sent, srcLang, tgtLang, normalizer, isTarget) {
    const isoLang = this._floresCodes[srcLang] || 'hi'
    const scriptPart = srcLang.split('_')[1]
    // These scripts are passed through without transliteration.
    const doTransliterate = !['Arab', 'Aran', 'Olck', 'Mtei', 'Latn'].includes(
      scriptPart
    )

    // 1) Punctuation normalization
    sent = this._puncNorm(sent)

    // 2) Numerals & placeholders
    sent = this._normalize(sent)

    // 3) Tokenization (and transliteration on the Indic path)
    let processedSent
    if (isoLang === 'en') {
      const eNorm = this._enNormalizer.normalize(sent.trim())
      processedSent = this._enTok.tokenize(eNorm, false, false, false).join(' ')
    } else {
      processedSent = this._doIndicTokenizeAndTransliterate(
        sent,
        normalizer,
        isoLang,
        doTransliterate
      )
    }

    processedSent = processedSent.trim()
    return isTarget ? processedSent : `${srcLang} ${tgtLang} ${processedSent}`
  }

  /**
   * Postprocess a single sentence.
   *
   * Consumes the oldest queued placeholder map (if any), so calls must
   * happen in the same order as the matching preprocessing calls.
   * @private
   * @param {string|Array} sent - Input sentence or array whose first item is the sentence
   * @param {string} lang - Language tag of the sentence
   * @returns {string} - Postprocessed sentence
   */
  _postprocess (sent, lang) {
    // Unwrap if sent is a tuple or list
    if (Array.isArray(sent)) {
      sent = sent[0]
    }

    // Dequeue rather than peek: peeking would reuse the first sentence's map
    // for every later sentence and let the queue grow without bound.
    // shift() yields undefined when nothing was queued (inference === false).
    const placeholderEntityMap = this._placeholderEntityMaps.shift()
    const [langCode, scriptCode] = lang.split('_', 2)
    const isoLang = this._floresCodes[lang] || 'hi'

    // Perso-Arabic scripts: drop the space before the question mark, full
    // stop and comma, and merge one two-code-point sequence into a letter.
    if (['Arab', 'Aran'].includes(scriptCode)) {
      const arabicFixes = [
        [' \u061f', '\u061f'],
        [' \u06d4', '\u06d4'],
        [' \u060c', '\u060c'],
        ['ٮ۪', 'ؠ']
      ]
      for (const [from, to] of arabicFixes) {
        sent = this._replaceAll(sent, from, to)
      }
    }

    // Oriya fix
    if (langCode === 'ory') {
      sent = this._replaceAll(sent, 'ଯ଼', 'ୟ')
    }

    // Restore placeholders (all occurrences, inserted literally)
    if (placeholderEntityMap) {
      for (const [placeholder, original] of Object.entries(placeholderEntityMap)) {
        sent = this._replaceAll(sent, placeholder, original)
      }
    }

    // Detokenize
    if (lang === 'eng_Latn') {
      return this._enDetok.detokenize(sent.split(' '))
    }
    const xlated = this._xliterator.transliterate(sent, 'hi', isoLang)
    return this._indicDetokenize.trivialDetokenize(xlated, isoLang)
  }

  /**
   * Preprocess a batch of sentences (normalize, tokenize, transliterate)
   * @public
   * @param {Array<string>} batch - Array of sentences
   * @param {string} srcLang - Source language tag
   * @param {string} tgtLang - Target language tag (default 'hin_Deva')
   * @param {boolean} isTarget - Whether these are target sentences
   * @returns {Array<string>} - Preprocessed sentences, in input order
   */
  preprocessBatch (
    batch,
    srcLang,
    tgtLang = 'hin_Deva',
    isTarget = false
  ) {
    const isoCode = this._floresCodes[srcLang] || 'hi'
    // One normalizer is shared by the whole batch; 'eng_Latn' needs none.
    const normalizer =
      srcLang !== 'eng_Latn'
        ? this._indicNormalizerFactory.getNormalizer(isoCode)
        : null

    return batch.map((s) =>
      this._preprocess(s, srcLang, tgtLang, normalizer, isTarget)
    )
  }

  /**
   * Postprocess a batch of sentences
   * @public
   * @param {Array<string>} sents - Sentences in the same order as they were preprocessed
   * @param {string} lang - Language tag (default 'hin_Deva')
   * @returns {Array<string>} - Postprocessed sentences
   */
  postprocessBatch (sents, lang = 'hin_Deva') {
    return sents.map((s) => this._postprocess(s, lang))
  }
}
564
+
565
+ module.exports = { IndicProcessor }