@qvac/translation-nmtcpp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. package/LICENSE +202 -0
  2. package/README.md +470 -0
  3. package/binding.js +1 -0
  4. package/index.d.ts +82 -0
  5. package/index.js +188 -0
  6. package/lib/error.js +65 -0
  7. package/marian.js +186 -0
  8. package/package.json +69 -0
  9. package/prebuilds/android-arm/qvac__translation-nmtcpp.bare +0 -0
  10. package/prebuilds/android-arm64/qvac__translation-nmtcpp.bare +0 -0
  11. package/prebuilds/android-ia32/qvac__translation-nmtcpp.bare +0 -0
  12. package/prebuilds/android-x64/qvac__translation-nmtcpp.bare +0 -0
  13. package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare +0 -0
  14. package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare.exports +3622 -0
  15. package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare +0 -0
  16. package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare.exports +3731 -0
  17. package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare +0 -0
  18. package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare.exports +3603 -0
  19. package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare +0 -0
  20. package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare.exports +3603 -0
  21. package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare +0 -0
  22. package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare.exports +3720 -0
  23. package/prebuilds/linux-x64/qvac__translation-nmtcpp.bare +0 -0
  24. package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare +0 -0
  25. package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare.exports +0 -0
  26. package/third-party/indic-processor-deps/indicnlp/INDIC_NLP_LICENCE +9 -0
  27. package/third-party/indic-processor-deps/indicnlp/index.js +11 -0
  28. package/third-party/indic-processor-deps/indicnlp/indic_detokenize.js +141 -0
  29. package/third-party/indic-processor-deps/indicnlp/indic_normalize.js +1213 -0
  30. package/third-party/indic-processor-deps/indicnlp/indic_tokenize.js +123 -0
  31. package/third-party/indic-processor-deps/indicnlp/langinfo.js +609 -0
  32. package/third-party/indic-processor-deps/indicnlp/sinhala_transliterator.js +197 -0
  33. package/third-party/indic-processor-deps/indicnlp/unicode_transliterator.js +120 -0
  34. package/third-party/indic-processor-deps/sacremoses/SACREMOSES_LICENCE +21 -0
  35. package/third-party/indic-processor-deps/sacremoses/cjk.js +202 -0
  36. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/README.txt +8 -0
  37. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.as +65 -0
  38. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.bn +65 -0
  39. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ca +75 -0
  40. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.cs +390 -0
  41. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.de +325 -0
  42. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.el +1568 -0
  43. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.en +123 -0
  44. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.es +118 -0
  45. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.et +138 -0
  46. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fi +138 -0
  47. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fr +153 -0
  48. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ga +48 -0
  49. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.gu +105 -0
  50. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hi +113 -0
  51. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hu +103 -0
  52. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.is +251 -0
  53. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.it +180 -0
  54. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.kn +70 -0
  55. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lt +698 -0
  56. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lv +100 -0
  57. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ml +67 -0
  58. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mni +65 -0
  59. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mr +113 -0
  60. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.nl +115 -0
  61. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.or +101 -0
  62. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pa +102 -0
  63. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pl +283 -0
  64. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pt +210 -0
  65. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ro +38 -0
  66. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ru +293 -0
  67. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sk +474 -0
  68. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sl +78 -0
  69. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sv +97 -0
  70. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ta +71 -0
  71. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.tdt +210 -0
  72. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.te +70 -0
  73. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.yue +53 -0
  74. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.zh +53 -0
  75. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJK.txt +23246 -0
  76. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJKSymbols.txt +1 -0
  77. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Close_Punctuation.txt +1 -0
  78. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Currency_Symbol.txt +1 -0
  79. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Han.txt +1 -0
  80. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul.txt +1 -0
  81. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul_Syllables.txt +1 -0
  82. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hiragana.txt +1 -0
  83. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum-unichars-au.txt +1 -0
  84. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum.txt +1 -0
  85. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha-unichars-au.txt +1 -0
  86. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha.txt +1 -0
  87. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsLower.txt +1 -0
  88. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsN.txt +1 -0
  89. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPf.txt +1 -0
  90. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPi.txt +1 -0
  91. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSc.txt +1 -0
  92. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSo.txt +1 -0
  93. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsUpper.txt +1 -0
  94. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Katakana.txt +1 -0
  95. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Line_Separator.txt +1 -0
  96. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Lowercase_Letter.txt +1 -0
  97. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Number.txt +1 -0
  98. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Open_Punctuation.txt +1 -0
  99. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Punctuation.txt +1 -0
  100. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Separator.txt +1 -0
  101. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Symbol.txt +1 -0
  102. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Titlecase_Letter.txt +1 -0
  103. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Uppercase_Letter.txt +1 -0
  104. package/third-party/indic-processor-deps/sacremoses/index.js +8 -0
  105. package/third-party/indic-processor-deps/sacremoses/indic.js +76 -0
  106. package/third-party/indic-processor-deps/sacremoses/normalizer.js +264 -0
  107. package/third-party/indic-processor-deps/sacremoses/pernuliprops.js +287 -0
  108. package/third-party/indic-processor-deps/sacremoses/tokenizer.js +1217 -0
  109. package/third-party/indic-processor.js +565 -0
@@ -0,0 +1,1217 @@
1
+ /**
2
+ * JavaScript port of the Moses Tokenizer from
3
+ * https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
4
+ */
5
+
6
+ const { Perluniprops, NonbreakingPrefixes } = require('./pernuliprops')
7
+ const { VIRAMAS, NUKTAS } = require('./indic')
8
+ const { isCJK } = require('./cjk')
9
+
10
+ /**
11
+ * MosesTokenizer class for tokenizing text in various languages
12
+ */
13
+ /**
14
+ * JavaScript port of the Moses Tokenizer from
15
+ * https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
16
+ */
17
+
18
+ class MosesTokenizer {
19
+ /**
20
+ * Initialize a new Moses Tokenizer
21
+ * @param {string} lang - Language code (default: "en")
22
+ * @param {string|null} customNonbreakingPrefixesFile - Path to custom prefixes file
23
+ */
24
+ constructor (lang = 'en', customNonbreakingPrefixesFile = null) {
25
+ this.lang = lang
26
+
27
+ // Initialize Perluniprops and NonbreakingPrefixes
28
+ this.perluniprops = new Perluniprops()
29
+ this.nonbreaking_prefixes = new NonbreakingPrefixes()
30
+
31
+ // Perl Unicode Properties character sets.
32
+ // Note: In JavaScript we'll convert the generator to arrays/strings for regex use
33
+ this.IsN = this._joinFromGenerator(this.perluniprops.chars('IsN'))
34
+
35
+ // Build IsAlnum with VIRAMAS and NUKTAS
36
+ const alnumChars = this._joinFromGenerator(
37
+ this.perluniprops.chars('IsAlnum')
38
+ )
39
+ this.IsAlnum = alnumChars + VIRAMAS + NUKTAS
40
+
41
+ this.IsSc = this._joinFromGenerator(this.perluniprops.chars('IsSc'))
42
+ this.IsSo = this._joinFromGenerator(this.perluniprops.chars('IsSo'))
43
+
44
+ // Build IsAlpha with VIRAMAS and NUKTAS
45
+ const alphaChars = this._joinFromGenerator(
46
+ this.perluniprops.chars('IsAlpha')
47
+ )
48
+ this.IsAlpha = alphaChars + VIRAMAS + NUKTAS
49
+
50
+ this.IsLower = this._joinFromGenerator(this.perluniprops.chars('IsLower'))
51
+
52
+ // Remove ASCII junk.
53
+ this.DEDUPLICATE_SPACE = [/\s+/g, ' ']
54
+ // eslint-disable-next-line no-control-regex
55
+ this.ASCII_JUNK = [/[\u0000-\u001F]/g, '']
56
+
57
+ // Pad all "other" special characters not in IsAlnum.
58
+ this.PAD_NOT_ISALNUM = [
59
+ new RegExp(
60
+ `([^${this._escapeRegExp(this.IsAlnum)}\\s\\.'\`\\,\\-])`,
61
+ 'g'
62
+ ),
63
+ ' $1 '
64
+ ]
65
+
66
+ // Splits all hyphens (regardless of circumstances), e.g. 'foo-bar' -> 'foo @-@ bar'
67
+ this.AGGRESSIVE_HYPHEN_SPLIT = [
68
+ new RegExp(
69
+ `([${this._escapeRegExp(this.IsAlnum)}])\\-(?=[${this._escapeRegExp(
70
+ this.IsAlnum
71
+ )}])`,
72
+ 'g'
73
+ ),
74
+ '$1 @-@ '
75
+ ]
76
+
77
+ // Make multi-dots stay together.
78
+ this.REPLACE_DOT_WITH_LITERALSTRING_1 = [/.([.]+)/g, ' DOTMULTI$1']
79
+ this.REPLACE_DOT_WITH_LITERALSTRING_2 = [
80
+ /DOTMULTI\.([^.])/,
81
+ 'DOTDOTMULTI $1'
82
+ ]
83
+ this.REPLACE_DOT_WITH_LITERALSTRING_3 = [/DOTMULTI\./g, 'DOTDOTMULTI']
84
+
85
+ // Separate out "," except if within numbers (5,300)
86
+ this.COMMA_SEPARATE_1 = [
87
+ new RegExp(`([^${this._escapeRegExp(this.IsN)}])[,]`, 'g'),
88
+ '$1 , '
89
+ ]
90
+ this.COMMA_SEPARATE_2 = [
91
+ new RegExp(`[,]([^${this._escapeRegExp(this.IsN)}])`, 'g'),
92
+ ' , $1'
93
+ ]
94
+ this.COMMA_SEPARATE_3 = [
95
+ new RegExp(`([${this._escapeRegExp(this.IsN)}])[,]$`, 'g'),
96
+ '$1 , '
97
+ ]
98
+
99
+ // Attempt to get correct directional quotes.
100
+ this.DIRECTIONAL_QUOTE_1 = [/^``/g, '`` ']
101
+ this.DIRECTIONAL_QUOTE_2 = [/^"/g, '`` ']
102
+ this.DIRECTIONAL_QUOTE_3 = [/^`([^`])/g, '` $1']
103
+ this.DIRECTIONAL_QUOTE_4 = [/^'/g, '` ']
104
+ this.DIRECTIONAL_QUOTE_5 = [/([ ([{<])"/g, '$1 `` ']
105
+ this.DIRECTIONAL_QUOTE_6 = [/([ ([{<])``/g, '$1 `` ']
106
+ this.DIRECTIONAL_QUOTE_7 = [/([ ([{<])`([^`])/g, '$1 ` $2']
107
+ this.DIRECTIONAL_QUOTE_8 = [/([ ([{<])'/g, '$1 ` ']
108
+
109
+ // Replace ... with _ELLIPSIS_ and later restore
110
+ this.REPLACE_ELLIPSIS = [/\.\.\./g, ' _ELLIPSIS_ ']
111
+ this.RESTORE_ELLIPSIS = [/_ELLIPSIS_/g, '...']
112
+
113
+ // Pad , with tailing space except if within numbers, e.g. 5,300
114
+ this.COMMA_1 = [
115
+ new RegExp(
116
+ `([^${this._escapeRegExp(this.IsN)}])[,]([^${this._escapeRegExp(
117
+ this.IsN
118
+ )}])`,
119
+ 'g'
120
+ ),
121
+ '$1 , $2'
122
+ ]
123
+ this.COMMA_2 = [
124
+ new RegExp(
125
+ `([${this._escapeRegExp(this.IsN)}])[,]([^${this._escapeRegExp(
126
+ this.IsN
127
+ )}])`,
128
+ 'g'
129
+ ),
130
+ '$1 , $2'
131
+ ]
132
+ this.COMMA_3 = [
133
+ new RegExp(
134
+ `([^${this._escapeRegExp(this.IsN)}])[,]([${this._escapeRegExp(
135
+ this.IsN
136
+ )}])`,
137
+ 'g'
138
+ ),
139
+ '$1 , $2'
140
+ ]
141
+
142
+ // Pad unicode symbols with spaces.
143
+ this.SYMBOLS = [
144
+ new RegExp(
145
+ `([;:@#\\$%&${this._escapeRegExp(this.IsSc)}${this._escapeRegExp(
146
+ this.IsSo
147
+ )}])`,
148
+ 'g'
149
+ ),
150
+ ' $1 '
151
+ ]
152
+
153
+ // Separate out intra-token slashes.
154
+ this.INTRATOKEN_SLASHES = [
155
+ new RegExp(
156
+ `([${this._escapeRegExp(this.IsAlnum)}])\\/([${this._escapeRegExp(
157
+ this.IsAlnum
158
+ )}])`,
159
+ 'g'
160
+ ),
161
+ '$1 @/@ $2'
162
+ ]
163
+
164
+ // Splits final period at end of string.
165
+ this.FINAL_PERIOD = [/([^.])([.])([\\]\)}>"']*) ?$/g, '$1 $2$3']
166
+
167
+ // Pad all question marks and exclamation marks with spaces.
168
+ this.PAD_QUESTION_EXCLAMATION_MARK = [/([?!])/g, ' $1 ']
169
+
170
+ // Handles parentheses, brackets and converts them to PTB symbols.
171
+ this.PAD_PARENTHESIS = [/([\][(){}<>])/g, ' $1 ']
172
+ this.CONVERT_PARENTHESIS_1 = [/\(/g, '-LRB-']
173
+ this.CONVERT_PARENTHESIS_2 = [/\)/g, '-RRB-']
174
+ this.CONVERT_PARENTHESIS_3 = [/\[/g, '-LSB-']
175
+ this.CONVERT_PARENTHESIS_4 = [/\]/g, '-RSB-']
176
+ this.CONVERT_PARENTHESIS_5 = [/\{/g, '-LCB-']
177
+ this.CONVERT_PARENTHESIS_6 = [/\}/g, '-RCB-']
178
+
179
+ // Pads double dashes with spaces.
180
+ this.PAD_DOUBLE_DASHES = [/--/g, ' -- ']
181
+
182
+ // Adds spaces to start and end of string to simplify further regexps.
183
+ this.PAD_START_OF_STR = [/^/g, ' ']
184
+ this.PAD_END_OF_STR = [/$/g, ' ']
185
+
186
+ // Converts double quotes to two single quotes and pad with spaces.
187
+ this.CONVERT_DOUBLE_TO_SINGLE_QUOTES = [/"/g, " '' "]
188
+
189
+ // Handles single quote in possessives or close-single-quote.
190
+ this.HANDLES_SINGLE_QUOTES = [/([^'])' /g, "$1 ' "]
191
+
192
+ // Pad apostrophe in possessive or close-single-quote.
193
+ this.APOSTROPHE = [/([^'])'/, "$1 ' "]
194
+
195
+ // Prepend space on contraction apostrophe.
196
+ this.CONTRACTION_1 = [/'([sSmMdD]) /g, " '$1 "]
197
+ this.CONTRACTION_2 = [/'ll /g, " 'll "]
198
+ this.CONTRACTION_3 = [/'re /g, " 're "]
199
+ this.CONTRACTION_4 = [/'ve /g, " 've "]
200
+ this.CONTRACTION_5 = [/n't /g, " n't "]
201
+ this.CONTRACTION_6 = [/'LL /g, " 'LL "]
202
+ this.CONTRACTION_7 = [/'RE /g, " 'RE "]
203
+ this.CONTRACTION_8 = [/'VE /g, " 'VE "]
204
+ this.CONTRACTION_9 = [/N'T /g, " N'T "]
205
+
206
+ // Informal Contractions.
207
+ this.CONTRACTION_10 = [/ ([Cc])annot /g, ' $1an not ']
208
+ this.CONTRACTION_11 = [/ ([Dd])'ye /g, " $1' ye "]
209
+ this.CONTRACTION_12 = [/ ([Gg])imme /g, ' $1im me ']
210
+ this.CONTRACTION_13 = [/ ([Gg])onna /g, ' $1on na ']
211
+ this.CONTRACTION_14 = [/ ([Gg])otta /g, ' $1ot ta ']
212
+ this.CONTRACTION_15 = [/ ([Ll])emme /g, ' $1em me ']
213
+ this.CONTRACTION_16 = [/ ([Mm])ore'n /g, " $1ore 'n "]
214
+ this.CONTRACTION_17 = [/ '([Tt])is /g, " '$1 is "]
215
+ this.CONTRACTION_18 = [/ '([Tt])was /g, " '$1 was "]
216
+ this.CONTRACTION_19 = [/ ([Ww])anna /g, ' $1an na ']
217
+
218
+ // Clean out extra spaces
219
+ this.CLEAN_EXTRA_SPACE_1 = [/ */g, ' ']
220
+ this.CLEAN_EXTRA_SPACE_2 = [/^ */g, '']
221
+ this.CLEAN_EXTRA_SPACE_3 = [/ *$/g, '']
222
+
223
+ // Neurotic Perl regexes to escape special characters.
224
+ this.ESCAPE_AMPERSAND = [/&/g, '&amp;']
225
+ this.ESCAPE_PIPE = [/\|/g, '&#124;']
226
+ this.ESCAPE_LEFT_ANGLE_BRACKET = [/</g, '&lt;']
227
+ this.ESCAPE_RIGHT_ANGLE_BRACKET = [/>/g, '&gt;']
228
+ this.ESCAPE_SINGLE_QUOTE = [/'/g, '&apos;']
229
+ this.ESCAPE_DOUBLE_QUOTE = [/"/g, '&quot;']
230
+ this.ESCAPE_LEFT_SQUARE_BRACKET = [/\[/g, '&#91;']
231
+ this.ESCAPE_RIGHT_SQUARE_BRACKET = [/\]/g, '&#93;']
232
+
233
+ // English-specific patterns for handling contractions and possessives
234
+ this.EN_SPECIFIC_1 = [
235
+ new RegExp(
236
+ `([^${this._escapeRegExp(this.IsAlpha)}])[']([^${this._escapeRegExp(
237
+ this.IsAlpha
238
+ )}])`,
239
+ 'g'
240
+ ),
241
+ "$1 ' $2"
242
+ ]
243
+ this.EN_SPECIFIC_2 = [
244
+ new RegExp(
245
+ `([^${this._escapeRegExp(this.IsAlpha)}${this._escapeRegExp(
246
+ this.IsN
247
+ )}])[']([${this._escapeRegExp(this.IsAlpha)}])`,
248
+ 'g'
249
+ ),
250
+ "$1 ' $2"
251
+ ]
252
+ this.EN_SPECIFIC_3 = [
253
+ new RegExp(
254
+ `([${this._escapeRegExp(this.IsAlpha)}])[']([^${this._escapeRegExp(
255
+ this.IsAlpha
256
+ )}])`,
257
+ 'g'
258
+ ),
259
+ "$1 ' $2"
260
+ ]
261
+ this.EN_SPECIFIC_4 = [
262
+ new RegExp(
263
+ `([${this._escapeRegExp(this.IsAlpha)}])[']([${this._escapeRegExp(
264
+ this.IsAlpha
265
+ )}])`,
266
+ 'g'
267
+ ),
268
+ "$1 '$2"
269
+ ]
270
+ this.EN_SPECIFIC_5 = [
271
+ new RegExp(`([${this._escapeRegExp(this.IsN)}])[']([s])`, 'g'),
272
+ "$1 '$2"
273
+ ]
274
+
275
+ this.ENGLISH_SPECIFIC_APOSTROPHE = [
276
+ this.EN_SPECIFIC_1,
277
+ this.EN_SPECIFIC_2,
278
+ this.EN_SPECIFIC_3,
279
+ this.EN_SPECIFIC_4,
280
+ this.EN_SPECIFIC_5
281
+ ]
282
+
283
+ // French/Italian specific patterns
284
+ this.FR_IT_SPECIFIC_1 = [
285
+ new RegExp(
286
+ `([^${this._escapeRegExp(this.IsAlpha)}])[']([^${this._escapeRegExp(
287
+ this.IsAlpha
288
+ )}])`,
289
+ 'g'
290
+ ),
291
+ "$1 ' $2"
292
+ ]
293
+ this.FR_IT_SPECIFIC_2 = [
294
+ new RegExp(
295
+ `([^${this._escapeRegExp(this.IsAlpha)}])[']([${this._escapeRegExp(
296
+ this.IsAlpha
297
+ )}])`,
298
+ 'g'
299
+ ),
300
+ "$1 ' $2"
301
+ ]
302
+ this.FR_IT_SPECIFIC_3 = [
303
+ new RegExp(
304
+ `([${this._escapeRegExp(this.IsAlpha)}])[']([^${this._escapeRegExp(
305
+ this.IsAlpha
306
+ )}])`,
307
+ 'g'
308
+ ),
309
+ "$1 ' $2"
310
+ ]
311
+ this.FR_IT_SPECIFIC_4 = [
312
+ new RegExp(
313
+ `([${this._escapeRegExp(this.IsAlpha)}])[']([${this._escapeRegExp(
314
+ this.IsAlpha
315
+ )}])`,
316
+ 'g'
317
+ ),
318
+ "$1' $2"
319
+ ]
320
+
321
+ this.FR_IT_SPECIFIC_APOSTROPHE = [
322
+ this.FR_IT_SPECIFIC_1,
323
+ this.FR_IT_SPECIFIC_2,
324
+ this.FR_IT_SPECIFIC_3,
325
+ this.FR_IT_SPECIFIC_4
326
+ ]
327
+
328
+ this.NON_SPECIFIC_APOSTROPHE = [/'/g, " ' "]
329
+
330
+ this.TRAILING_DOT_APOSTROPHE = [/\.' ?$/g, " . ' "]
331
+
332
+ // Protected patterns
333
+ this.BASIC_PROTECTED_PATTERN_1 = /<\/?\S+\/?>/
334
+ this.BASIC_PROTECTED_PATTERN_2 = /<\S+( [a-zA-Z0-9]+="?[^"]*")+ ?\/?>/
335
+ this.BASIC_PROTECTED_PATTERN_3 = /<\S+( [a-zA-Z0-9]+='?[^']*')+ ?\/?>/
336
+ this.BASIC_PROTECTED_PATTERN_4 = /[\w\-_.]+@([\w\-_]+\.)+[a-zA-Z]{2,}/
337
+ this.BASIC_PROTECTED_PATTERN_5 =
338
+ /(https?|ftp):\/\/[^:/\s]+(\/\w+)*\/[\w\-.]+/
339
+
340
+ // Collected into an array for easy use
341
+ this.BASIC_PROTECTED_PATTERNS = [
342
+ this.BASIC_PROTECTED_PATTERN_1,
343
+ this.BASIC_PROTECTED_PATTERN_2,
344
+ this.BASIC_PROTECTED_PATTERN_3,
345
+ this.BASIC_PROTECTED_PATTERN_4,
346
+ this.BASIC_PROTECTED_PATTERN_5
347
+ ]
348
+
349
+ this.WEB_PROTECTED_PATTERNS = [
350
+ /((https?|ftp|rsync):\/\/|www\.)[^ ]*/, // URLs
351
+ /[\w\-_.]+@([\w\-_]+\.)+[a-zA-Z]{2,}/, // Emails
352
+ /@[a-zA-Z0-9_]+/, // @handler such as twitter/github ID
353
+ /#[a-zA-Z0-9_]+/ // @hashtag
354
+ ]
355
+
356
+ // Groups of regexes for different stages of tokenization
357
+ this.MOSES_PENN_REGEXES_1 = [
358
+ this.DEDUPLICATE_SPACE,
359
+ this.ASCII_JUNK,
360
+ this.DIRECTIONAL_QUOTE_1,
361
+ this.DIRECTIONAL_QUOTE_2,
362
+ this.DIRECTIONAL_QUOTE_3,
363
+ this.DIRECTIONAL_QUOTE_4,
364
+ this.DIRECTIONAL_QUOTE_5,
365
+ this.DIRECTIONAL_QUOTE_6,
366
+ this.DIRECTIONAL_QUOTE_7,
367
+ this.DIRECTIONAL_QUOTE_8,
368
+ this.REPLACE_ELLIPSIS,
369
+ this.COMMA_1,
370
+ this.COMMA_2,
371
+ this.COMMA_3,
372
+ this.SYMBOLS,
373
+ this.INTRATOKEN_SLASHES,
374
+ this.FINAL_PERIOD,
375
+ this.PAD_QUESTION_EXCLAMATION_MARK,
376
+ this.PAD_PARENTHESIS,
377
+ this.CONVERT_PARENTHESIS_1,
378
+ this.CONVERT_PARENTHESIS_2,
379
+ this.CONVERT_PARENTHESIS_3,
380
+ this.CONVERT_PARENTHESIS_4,
381
+ this.CONVERT_PARENTHESIS_5,
382
+ this.CONVERT_PARENTHESIS_6,
383
+ this.PAD_DOUBLE_DASHES,
384
+ this.PAD_START_OF_STR,
385
+ this.PAD_END_OF_STR,
386
+ this.CONVERT_DOUBLE_TO_SINGLE_QUOTES,
387
+ this.HANDLES_SINGLE_QUOTES,
388
+ this.APOSTROPHE,
389
+ this.CONTRACTION_1,
390
+ this.CONTRACTION_2,
391
+ this.CONTRACTION_3,
392
+ this.CONTRACTION_4,
393
+ this.CONTRACTION_5,
394
+ this.CONTRACTION_6,
395
+ this.CONTRACTION_7,
396
+ this.CONTRACTION_8,
397
+ this.CONTRACTION_9,
398
+ this.CONTRACTION_10,
399
+ this.CONTRACTION_11,
400
+ this.CONTRACTION_12,
401
+ this.CONTRACTION_13,
402
+ this.CONTRACTION_14,
403
+ this.CONTRACTION_15,
404
+ this.CONTRACTION_16,
405
+ this.CONTRACTION_17,
406
+ this.CONTRACTION_18,
407
+ this.CONTRACTION_19
408
+ ]
409
+
410
+ this.MOSES_PENN_REGEXES_2 = [
411
+ this.RESTORE_ELLIPSIS,
412
+ this.CLEAN_EXTRA_SPACE_1,
413
+ this.CLEAN_EXTRA_SPACE_2,
414
+ this.CLEAN_EXTRA_SPACE_3,
415
+ this.ESCAPE_AMPERSAND,
416
+ this.ESCAPE_PIPE,
417
+ this.ESCAPE_LEFT_ANGLE_BRACKET,
418
+ this.ESCAPE_RIGHT_ANGLE_BRACKET,
419
+ this.ESCAPE_SINGLE_QUOTE,
420
+ this.ESCAPE_DOUBLE_QUOTE
421
+ ]
422
+
423
+ this.MOSES_ESCAPE_XML_REGEXES = [
424
+ this.ESCAPE_AMPERSAND,
425
+ this.ESCAPE_PIPE,
426
+ this.ESCAPE_LEFT_ANGLE_BRACKET,
427
+ this.ESCAPE_RIGHT_ANGLE_BRACKET,
428
+ this.ESCAPE_SINGLE_QUOTE,
429
+ this.ESCAPE_DOUBLE_QUOTE,
430
+ this.ESCAPE_LEFT_SQUARE_BRACKET,
431
+ this.ESCAPE_RIGHT_SQUARE_BRACKET
432
+ ]
433
+
434
+ // Initialize the language specific nonbreaking prefixes.
435
+ this.NONBREAKING_PREFIXES = this.nonbreaking_prefixes
436
+ .getWordsAsArray(lang)
437
+ .map((nbp) => nbp.trim())
438
+
439
+ // Load custom nonbreaking prefixes file.
440
+ if (customNonbreakingPrefixesFile) {
441
+ // In a real implementation, this would load from a file
442
+ this.NONBREAKING_PREFIXES = []
443
+ // Code to read from file would go here
444
+ }
445
+
446
+ this.NUMERIC_ONLY_PREFIXES = this.NONBREAKING_PREFIXES.filter((w) =>
447
+ this.hasNumericOnly(w)
448
+ ).map((w) => w.split(' ')[0])
449
+
450
+ // Add CJK characters to alpha and alnum
451
+ if (['zh', 'ja', 'ko', 'cjk'].includes(this.lang)) {
452
+ let cjkChars = ''
453
+ if (['ko', 'cjk'].includes(this.lang)) {
454
+ cjkChars += this._joinFromGenerator(this.perluniprops.chars('Hangul'))
455
+ }
456
+ if (['zh', 'cjk'].includes(this.lang)) {
457
+ cjkChars += this._joinFromGenerator(this.perluniprops.chars('Han'))
458
+ }
459
+ if (['ja', 'cjk'].includes(this.lang)) {
460
+ cjkChars += this._joinFromGenerator(
461
+ this.perluniprops.chars('Hiragana')
462
+ )
463
+ cjkChars += this._joinFromGenerator(
464
+ this.perluniprops.chars('Katakana')
465
+ )
466
+ cjkChars += this._joinFromGenerator(this.perluniprops.chars('Han'))
467
+ }
468
+ this.IsAlpha += cjkChars
469
+ this.IsAlnum += cjkChars
470
+
471
+ // Overwrite the alnum regexes
472
+ this.PAD_NOT_ISALNUM = [
473
+ new RegExp(
474
+ `([^${this._escapeRegExp(this.IsAlnum)}\\s\\.'\`\\,\\-])`,
475
+ 'g'
476
+ ),
477
+ ' $1 '
478
+ ]
479
+ this.AGGRESSIVE_HYPHEN_SPLIT = [
480
+ new RegExp(
481
+ `([${this._escapeRegExp(this.IsAlnum)}])\\-(?=[${this._escapeRegExp(
482
+ this.IsAlnum
483
+ )}])`,
484
+ 'g'
485
+ ),
486
+ '$1 @-@ '
487
+ ]
488
+ this.INTRATOKEN_SLASHES = [
489
+ new RegExp(
490
+ `([${this._escapeRegExp(this.IsAlnum)}])\\/([${this._escapeRegExp(
491
+ this.IsAlnum
492
+ )}])`,
493
+ 'g'
494
+ ),
495
+ '$1 @/@ $2'
496
+ ]
497
+ }
498
+ }
499
+
500
+ /**
501
+ * Helper method to escape special characters in a string for regex
502
+ * @param {string} str - String to escape
503
+ * @returns {string} - Escaped string
504
+ * @private
505
+ */
506
+ _escapeRegExp (str) {
507
+ return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
508
+ }
509
+
510
+ /**
511
+ * Helper method to convert a generator to a string
512
+ * @param {Generator} generator - Generator to convert
513
+ * @returns {string} - Resulting string
514
+ * @private
515
+ */
516
+ _joinFromGenerator (generator) {
517
+ let result = ''
518
+ for (const char of generator) {
519
+ result += char
520
+ }
521
+ return result
522
+ }
523
+
524
+ /**
525
+ * Replaces multi-dots with placeholder text
526
+ * @param {string} text - Input text
527
+ * @returns {string} - Processed text
528
+ */
529
+ replaceMultidots (text) {
530
+ text = text.replace(/\.([.]+)/g, ' DOTMULTI$1')
531
+ const dotmulti = /DOTMULTI\./
532
+ while (dotmulti.test(text)) {
533
+ text = text.replace(/DOTMULTI\.([^.])/g, 'DOTDOTMULTI $1')
534
+ text = text.replace(dotmulti, 'DOTDOTMULTI')
535
+ }
536
+ return text
537
+ }
538
+
539
+ /**
540
+ * Restores multi-dots from placeholder text
541
+ * @param {string} text - Input text
542
+ * @returns {string} - Processed text
543
+ */
544
+ restoreMultidots (text) {
545
+ const dotmulti = /DOTDOTMULTI/
546
+ while (dotmulti.test(text)) {
547
+ text = text.replace(dotmulti, 'DOTMULTI.')
548
+ }
549
+ return text.replace(/DOTMULTI/g, '.')
550
+ }
551
+
552
+ /**
553
+ * Check if text contains only lowercase characters
554
+ * @param {string} text - Input text
555
+ * @returns {boolean} - True if all characters are lowercase
556
+ */
557
+ islower (text) {
558
+ for (let i = 0; i < text.length; i++) {
559
+ if (!this.IsLower.includes(text[i])) {
560
+ return false
561
+ }
562
+ }
563
+ return true
564
+ }
565
+
566
+ /**
567
+ * Check if text contains any alphabetic characters
568
+ * @param {string} text - Input text
569
+ * @returns {boolean} - True if any character is alphabetic
570
+ */
571
+ isanyalpha (text) {
572
+ for (let i = 0; i < text.length; i++) {
573
+ if (this.IsAlpha.includes(text[i])) {
574
+ return true
575
+ }
576
+ }
577
+ return false
578
+ }
579
+
580
+ /**
581
+ * Check if text contains numeric-only marker
582
+ * @param {string} text - Input text
583
+ * @returns {boolean} - True if text has a numeric-only marker
584
+ */
585
+ hasNumericOnly (text) {
586
+ return /[\s]+(#NUMERIC_ONLY#)/.test(text)
587
+ }
588
+
589
+ /**
590
+ * Handle nonbreaking prefixes in text
591
+ * @param {string} text - Input text
592
+ * @returns {string} - Processed text
593
+ */
594
+ handlesNonbreakingPrefixes (text) {
595
+ // Splits the text into tokens to check for nonbreaking prefixes
596
+ const tokens = text.split(/\s+/)
597
+ const numTokens = tokens.length
598
+
599
+ for (let i = 0; i < numTokens; i++) {
600
+ const token = tokens[i]
601
+ // Checks if token ends with a fullstop
602
+ const tokenEndsWithPeriod = /^(\S+)\.$/.exec(token)
603
+
604
+ if (tokenEndsWithPeriod) {
605
+ const prefix = tokenEndsWithPeriod[1]
606
+ // Check conditions for nonbreaking prefixes
607
+ if (
608
+ (prefix.includes('.') && this.isanyalpha(prefix)) ||
609
+ (this.NONBREAKING_PREFIXES.includes(prefix) &&
610
+ !this.NUMERIC_ONLY_PREFIXES.includes(prefix)) ||
611
+ (i !== numTokens - 1 &&
612
+ tokens[i + 1] &&
613
+ this.islower(tokens[i + 1][0]))
614
+ ) {
615
+ // No change to the token
616
+ } else if (
617
+ // Check if prefix is in NUMERIC_ONLY_PREFIXES and next token is a digit
618
+ this.NUMERIC_ONLY_PREFIXES.includes(prefix) &&
619
+ i + 1 < numTokens &&
620
+ /^[0-9]+/.test(tokens[i + 1])
621
+ ) {
622
+ // No change to the token
623
+ } else {
624
+ // Adds a space after the tokens before a dot
625
+ tokens[i] = prefix + ' .'
626
+ }
627
+ }
628
+ }
629
+
630
+ return tokens.join(' ') // Stitch the tokens back
631
+ }
632
+
633
+ /**
634
+ * Escape XML special characters in text
635
+ * @param {string} text - Input text
636
+ * @returns {string} - Processed text
637
+ */
638
+ escapeXml (text) {
639
+ for (const [regexp, substitution] of this.MOSES_ESCAPE_XML_REGEXES) {
640
+ text = text.replace(regexp, substitution)
641
+ }
642
+ return text
643
+ }
644
+
645
+ /**
646
+ * Penn Treebank tokenization
647
+ * @param {string} text - Input text
648
+ * @param {boolean} returnStr - Whether to return a string or array
649
+ * @returns {string|Array} - Tokenized text
650
+ */
651
+ pennTokenize (text, returnStr = false) {
652
+ // Converts input string into unicode
653
+ text = String(text)
654
+
655
+ // Perform a chain of regex substitutions using MOSES_PENN_REGEXES_1
656
+ for (const [regexp, substitution] of this.MOSES_PENN_REGEXES_1) {
657
+ text = text.replace(regexp, substitution)
658
+ }
659
+
660
+ // Handles nonbreaking prefixes
661
+ text = this.handlesNonbreakingPrefixes(text)
662
+
663
+ // Restore ellipsis, clean extra spaces, escape XML symbols
664
+ for (const [regexp, substitution] of this.MOSES_PENN_REGEXES_2) {
665
+ text = text.replace(regexp, substitution)
666
+ }
667
+
668
+ return returnStr ? text : text.split(/\s+/).filter((t) => t.length > 0)
669
+ }
670
+
671
+ /**
672
+ * Main tokenization method
673
+ * @param {string} text - Input text
674
+ * @param {boolean} aggressiveDashSplits - Whether to aggressively split dashes
675
+ * @param {boolean} returnStr - Whether to return a string or array
676
+ * @param {boolean} escape - Whether to escape XML
677
+ * @param {Array} protectedPatterns - Patterns to protect from tokenization
678
+ * @returns {string|Array} - Tokenized text
679
+ */
680
+ tokenize (
681
+ text,
682
+ aggressiveDashSplits = false,
683
+ returnStr = false,
684
+ escape = true,
685
+ protectedPatterns = null
686
+ ) {
687
+ // Converts input string into unicode
688
+ text = String(text)
689
+
690
+ // De-duplicate spaces and clean ASCII junk
691
+ for (const [regexp, substitution] of [
692
+ this.DEDUPLICATE_SPACE,
693
+ this.ASCII_JUNK
694
+ ]) {
695
+ text = text.replace(regexp, substitution)
696
+ }
697
+
698
+ // Initialize protectedTokens array HERE (properly scoped)
699
+ const protectedTokens = []
700
+
701
+ // Process protected patterns
702
+ if (protectedPatterns) {
703
+ try {
704
+ // Compile all patterns with global and case insensitivity flags
705
+ const compiledPatterns = protectedPatterns.map((p) =>
706
+ p instanceof RegExp
707
+ ? new RegExp(
708
+ p.source,
709
+ p.flags.includes('g') ? p.flags : p.flags + 'g'
710
+ )
711
+ : new RegExp(p, 'gi')
712
+ )
713
+
714
+ // Find all matches across all patterns
715
+ compiledPatterns.forEach((pattern) => {
716
+ // Reset lastIndex to start from beginning
717
+ pattern.lastIndex = 0
718
+
719
+ // Find all matches for this pattern
720
+ let match
721
+ while ((match = pattern.exec(text)) !== null) {
722
+ if (match[0].length > 0) {
723
+ // Skip empty matches
724
+ protectedTokens.push(match[0])
725
+ }
726
+
727
+ // Avoid infinite loops for zero-width matches
728
+ if (match.index === pattern.lastIndex) {
729
+ pattern.lastIndex++
730
+ }
731
+ }
732
+ })
733
+
734
+ // Ensure we don't exceed 1000 matches (3-digit limit)
735
+ if (protectedTokens.length > 1000) {
736
+ console.warn(
737
+ `More than 1000 protected tokens found (${protectedTokens.length}). Using only the first 1000.`
738
+ )
739
+ protectedTokens.length = 1000 // Truncate to 1000
740
+ }
741
+
742
+ // Sort by length (longest first) to prevent substring replacements
743
+ const sortedTokenWithIndices = [...protectedTokens].map((token, i) => ({
744
+ token,
745
+ index: i
746
+ }))
747
+ sortedTokenWithIndices.sort((a, b) => b.token.length - a.token.length)
748
+
749
+ // Apply replacements from longest to shortest
750
+ for (const { token, index } of sortedTokenWithIndices) {
751
+ const substitution =
752
+ 'THISISPROTECTED' + String(index).padStart(3, '0')
753
+
754
+ // Use split and join to replace all occurrences
755
+ text = text.split(token).join(substitution)
756
+ }
757
+ } catch (e) {
758
+ console.error('Error processing protected patterns:', e)
759
+ // Continue without protected pattern processing
760
+ }
761
+ }
762
+
763
+ // Strips heading and trailing spaces
764
+ text = text.trim()
765
+
766
+ // Separate special characters outside of IsAlnum character set
767
+ const [regexpNotAlnum, substitutionNotAlnum] = this.PAD_NOT_ISALNUM
768
+ text = text.replace(regexpNotAlnum, substitutionNotAlnum)
769
+
770
+ // Aggressively splits dashes
771
+ if (aggressiveDashSplits) {
772
+ const [regexpHyphen, substitutionHyphen] = this.AGGRESSIVE_HYPHEN_SPLIT
773
+ text = text.replace(regexpHyphen, substitutionHyphen)
774
+ }
775
+
776
+ // Replaces multidots with "DOTDOTMULTI" literal strings
777
+ text = this.replaceMultidots(text)
778
+
779
+ // Separate out "," except if within numbers e.g. 5,300
780
+ for (const [regexp, substitution] of [
781
+ this.COMMA_SEPARATE_1,
782
+ this.COMMA_SEPARATE_2,
783
+ this.COMMA_SEPARATE_3
784
+ ]) {
785
+ text = text.replace(regexp, substitution)
786
+ }
787
+
788
+ // Language-specific apostrophe tokenization
789
+ if (this.lang === 'en') {
790
+ for (const [regexp, substitution] of this.ENGLISH_SPECIFIC_APOSTROPHE) {
791
+ text = text.replace(regexp, substitution)
792
+ }
793
+ } else if (this.lang === 'fr' || this.lang === 'it') {
794
+ for (const [regexp, substitution] of this.FR_IT_SPECIFIC_APOSTROPHE) {
795
+ text = text.replace(regexp, substitution)
796
+ }
797
+ } else {
798
+ const [regexp, substitution] = this.NON_SPECIFIC_APOSTROPHE
799
+ text = text.replace(regexp, substitution)
800
+ }
801
+
802
+ // Handles nonbreaking prefixes
803
+ text = this.handlesNonbreakingPrefixes(text)
804
+
805
+ // Cleans up extraneous spaces
806
+ const [regexpSpace, substitutionSpace] = this.DEDUPLICATE_SPACE
807
+ text = text.replace(regexpSpace, substitutionSpace).trim()
808
+
809
+ // Split trailing ".'".
810
+ const [regexpDotApostrophe, substitutionDotApostrophe] =
811
+ this.TRAILING_DOT_APOSTROPHE
812
+ text = text.replace(regexpDotApostrophe, substitutionDotApostrophe)
813
+
814
+ // Restore the protected tokens
815
+ if (protectedPatterns && protectedTokens.length > 0) {
816
+ // Process from 0 to length (the indices are embedded in the substitution strings)
817
+ for (let i = 0; i < protectedTokens.length; i++) {
818
+ const substitution = 'THISISPROTECTED' + String(i).padStart(3, '0')
819
+ const token = protectedTokens[i]
820
+ text = text.split(substitution).join(token)
821
+ }
822
+ }
823
+
824
+ // Restore multidots
825
+ text = this.restoreMultidots(text)
826
+
827
+ if (escape) {
828
+ // Escape XML symbols
829
+ text = this.escapeXml(text)
830
+ }
831
+
832
+ return returnStr ? text : text.split(/\s+/).filter((t) => t.length > 0)
833
+ }
834
+ }
835
+
836
/**
 * MosesDetokenizer joins Moses-style tokens back into running text.
 *
 * It walks the tokens left to right and decides, per token, whether the
 * token attaches to its left neighbour (no space before it), to its right
 * neighbour (no space after it) or stands alone, using language-specific
 * rules for en, fr, it, ga, cs, fi and CJK text.
 *
 * Relies on `Perluniprops` and `isCJK`, which are defined elsewhere in this
 * module.
 */
class MosesDetokenizer {
  /**
   * Initialize a new Moses Detokenizer
   * @param {string} lang - Language code (default: "en")
   */
  constructor (lang = 'en') {
    this.lang = lang

    // Provider of the Unicode character classes used below
    this.perluniprops = new Perluniprops()

    // Character classes flattened to strings so they can be spliced into
    // regex character classes
    this.IsAlnum = this._joinFromGenerator(this.perluniprops.chars('IsAlnum'))
    this.IsAlpha = this._joinFromGenerator(this.perluniprops.chars('IsAlpha'))
    this.IsSc = this._joinFromGenerator(this.perluniprops.chars('IsSc'))

    // Every rule below is a [pattern, replacement] pair

    // Undo the tokenizer's aggressive hyphen split
    this.AGGRESSIVE_HYPHEN_SPLIT = [/ @-@ /g, '-']

    // Merge multiple spaces
    this.ONE_SPACE = [/ {2,}/g, ' ']

    // Unescape special characters
    this.UNESCAPE_FACTOR_SEPARATOR = [/&#124;/g, '|']
    this.UNESCAPE_LEFT_ANGLE_BRACKET = [/&lt;/g, '<']
    this.UNESCAPE_RIGHT_ANGLE_BRACKET = [/&gt;/g, '>']
    this.UNESCAPE_DOUBLE_QUOTE = [/&quot;/g, '"']
    this.UNESCAPE_SINGLE_QUOTE = [/&apos;/g, "'"]
    this.UNESCAPE_SYNTAX_NONTERMINAL_LEFT = [/&#91;/g, '[']
    this.UNESCAPE_SYNTAX_NONTERMINAL_RIGHT = [/&#93;/g, ']']
    this.UNESCAPE_AMPERSAND = [/&amp;/g, '&']

    // Legacy regexes for older Moses versions
    this.UNESCAPE_FACTOR_SEPARATOR_LEGACY = [/&bar;/g, '|']
    this.UNESCAPE_SYNTAX_NONTERMINAL_LEFT_LEGACY = [/&bra;/g, '[']
    this.UNESCAPE_SYNTAX_NONTERMINAL_RIGHT_LEGACY = [/&ket;/g, ']']

    // Application order matters: "&amp;" is unescaped last so that an
    // escaped entity such as "&amp;lt;" is not decoded twice.
    this.MOSES_UNESCAPE_XML_REGEXES = [
      this.UNESCAPE_FACTOR_SEPARATOR_LEGACY,
      this.UNESCAPE_FACTOR_SEPARATOR,
      this.UNESCAPE_LEFT_ANGLE_BRACKET,
      this.UNESCAPE_RIGHT_ANGLE_BRACKET,
      this.UNESCAPE_SYNTAX_NONTERMINAL_LEFT_LEGACY,
      this.UNESCAPE_SYNTAX_NONTERMINAL_RIGHT_LEGACY,
      this.UNESCAPE_DOUBLE_QUOTE,
      this.UNESCAPE_SINGLE_QUOTE,
      this.UNESCAPE_SYNTAX_NONTERMINAL_LEFT,
      this.UNESCAPE_SYNTAX_NONTERMINAL_RIGHT,
      this.UNESCAPE_AMPERSAND
    ]

    // Finnish morphological rules
    this.FINNISH_MORPHSET_1 = [
      'N', 'n', 'A', 'a', 'Ä', 'ä',
      'ssa', 'Ssa', 'ssä', 'Ssä',
      'sta', 'stä', 'Sta', 'Stä',
      'hun', 'Hun', 'hyn', 'Hyn',
      'han', 'Han', 'hän', 'Hän', 'hön', 'Hön',
      'un', 'Un', 'yn', 'Yn', 'an', 'An', 'än', 'Än', 'ön', 'Ön',
      'seen', 'Seen',
      'lla', 'Lla', 'llä', 'Llä',
      'lta', 'Lta', 'ltä', 'Ltä',
      'lle', 'Lle', 'ksi', 'Ksi', 'kse', 'Kse',
      'tta', 'Tta', 'ine', 'Ine'
    ]

    this.FINNISH_MORPHSET_2 = ['ni', 'si', 'mme', 'nne', 'nsa']

    this.FINNISH_MORPHSET_3 = [
      'ko', 'kö', 'han', 'hän', 'pa', 'pä', 'kaan', 'kään', 'kin'
    ]

    // Case suffix, optionally followed by a possessive suffix and/or a
    // clitic. The third group must be optional too, otherwise plain case
    // suffixes such as "n" or "ssa" can never match.
    this.FINNISH_REGEX = new RegExp(
      `^(${this.FINNISH_MORPHSET_1.join('|')})` +
        `(${this.FINNISH_MORPHSET_2.join('|')})?` +
        `(${this.FINNISH_MORPHSET_3.join('|')})?$`
    )

    // Tokens that attach to the token on their right
    this.IS_CURRENCY_SYMBOL = new RegExp(
      `^[${this._escapeRegExp(this.IsSc)}\\(\\[\\{\\¿\\¡]+$`
    )
    this.IS_ENGLISH_CONTRACTION = new RegExp(
      `^['][${this._escapeRegExp(this.IsAlpha)}]`
    )
    // NOTE: the property name keeps its historical misspelling
    // ("CONRTACTION") so existing references keep working.
    this.IS_FRENCH_CONRTACTION = new RegExp(
      `[${this._escapeRegExp(this.IsAlpha)}][']$`
    )
    this.STARTS_WITH_ALPHA = new RegExp(
      `^[${this._escapeRegExp(this.IsAlpha)}]`
    )
    // eslint-disable-next-line no-useless-escape
    this.IS_PUNCT = /^[\,\.\?\!\:\;\\\%\}\]\)]+$/
    // ' " „ (U+201E) “ (U+201C) and backtick. Written with escapes so the
    // curly quotes cannot be silently normalised to a straight quote.
    this.IS_OPEN_QUOTE = /^['"\u201E\u201C`]+$/
  }

  /**
   * Helper method to escape special characters in a string for regex.
   * "-" is escaped as well because the result is spliced into character
   * classes, where an unescaped "-" would create a range.
   * @param {string} str - String to escape
   * @returns {string} - Escaped string
   * @private
   */
  _escapeRegExp (str) {
    return str.replace(/[.*+?^${}()|[\]\\-]/g, '\\$&')
  }

  /**
   * Helper method to concatenate everything an iterable yields into a string
   * @param {Generator} generator - Generator to convert
   * @returns {string} - Resulting string
   * @private
   */
  _joinFromGenerator (generator) {
    let result = ''
    for (const char of generator) {
      result += char
    }
    return result
  }

  /**
   * Unescape XML-escaped characters in text, applying the rules of
   * MOSES_UNESCAPE_XML_REGEXES in order.
   * @param {string} text - Input text
   * @returns {string} - Processed text
   */
  unescapeXml (text) {
    for (const [regexp, substitution] of this.MOSES_UNESCAPE_XML_REGEXES) {
      text = text.replace(regexp, substitution)
    }
    return text
  }

  /**
   * Main detokenization method (named tokenize for compatibility with Python original)
   * @param {Array} tokens - Array of tokens to detokenize
   * @param {boolean} returnStr - Whether to return a string or array
   * @param {boolean} unescape - Whether to unescape XML
   * @returns {string|Array} - Detokenized text; with returnStr=false the
   *   detokenized text split on whitespace (an empty array for empty text)
   */
  tokenize (tokens, returnStr = true, unescape = true) {
    // Convert the list of tokens into a string and pad it with spaces
    let text = ` ${tokens.join(' ')} `

    // Detokenize the aggressive hyphen split
    const [regexpHyphen, substitutionHyphen] = this.AGGRESSIVE_HYPHEN_SPLIT
    text = text.replace(regexpHyphen, substitutionHyphen)

    if (unescape) {
      // Unescape the XML symbols
      text = this.unescapeXml(text)
    }

    // Number of times each (normalised) quote token has been seen; an even
    // count means the next occurrence opens a quotation.
    const quoteCounts = { "'": 0, '"': 0, '``': 0, '`': 0, "''": 0 }

    // Separator emitted before the next token; '' means the previous token
    // was right-shifted and the next one must attach to it directly.
    let prependSpace = ' '
    let detokenizedText = ''

    // Re-split, because hyphen merging and unescaping may have changed the
    // token boundaries
    const tokenArray = text.split(/\s+/).filter((t) => t.length > 0)

    for (let i = 0; i < tokenArray.length; i++) {
      const token = tokenArray[i]

      // Skip empty tokens
      if (!token) continue

      if (token[0] && isCJK(token[0]) && this.lang !== 'ko') {
        // CJK word: glue it to a preceding CJK word, otherwise treat normally
        if (
          i > 0 &&
          tokenArray[i - 1] &&
          tokenArray[i - 1].length > 0 &&
          isCJK(tokenArray[i - 1][tokenArray[i - 1].length - 1])
        ) {
          detokenizedText += token
        } else {
          detokenizedText += prependSpace + token
        }
        prependSpace = ' '
      } else if (this.IS_CURRENCY_SYMBOL.test(token)) {
        // Right shift on currency symbols and opening brackets
        detokenizedText += prependSpace + token
        prependSpace = ''
      } else if (this.IS_PUNCT.test(token)) {
        // In French these punctuation marks are preceded by a space
        // (a plain space is emitted here)
        if (this.lang === 'fr' && /^[?!:;\\%]$/.test(token)) {
          detokenizedText += ' '
        }
        // Left shift on punctuation items
        detokenizedText += token
        prependSpace = ' '
      } else if (
        this.lang === 'en' &&
        i > 0 &&
        this.IS_ENGLISH_CONTRACTION.test(token)
      ) {
        // English contractions ('s, 'll, ...): left shift
        detokenizedText += token
        prependSpace = ' '
      } else if (
        this.lang === 'cs' &&
        i > 1 &&
        /^[0-9]+$/.test(tokenArray[i - 2]) && // Previous previous token is a number
        /^[.,]$/.test(tokenArray[i - 1]) && // Previous token is a dot/comma
        /^[0-9]+$/.test(token) // Current token is a number
      ) {
        // Czech decimal numbers: left shift the fractional part
        detokenizedText += token
        prependSpace = ' '
      } else if (
        ['fr', 'it', 'ga'].includes(this.lang) &&
        i < tokenArray.length - 1 &&
        this.IS_FRENCH_CONRTACTION.test(token) &&
        this.STARTS_WITH_ALPHA.test(tokenArray[i + 1])
      ) {
        // French/Italian/Gaelic contractions (l', d', ...): right shift
        detokenizedText += prependSpace + token
        prependSpace = ''
      } else if (
        this.lang === 'cs' &&
        i < tokenArray.length - 2 &&
        this.IS_FRENCH_CONRTACTION.test(token) &&
        /^[-–]$/.test(tokenArray[i + 1]) &&
        /^li$|^mail.*/i.test(tokenArray[i + 2])
      ) {
        // Czech "-li" and a few dashed words (e.g. e-mail): emit the token
        // together with the dash and right shift
        detokenizedText += prependSpace + token + tokenArray[i + 1]
        i++ // Skip the dash token
        prependSpace = ''
      } else if (this.IS_OPEN_QUOTE.test(token)) {
        // All double-quote variants share one counter. The straight quote
        // is listed too so tokens starting with it keep normalising as before.
        let normalizedQuo = token
        if (/^[\u201E\u201C\u201D"]/.test(token)) {
          normalizedQuo = '"'
        }

        // Initialize quote count if not present
        quoteCounts[normalizedQuo] = quoteCounts[normalizedQuo] || 0

        // Czech typography: „ (U+201E) always opens, “ (U+201C) always closes
        if (this.lang === 'cs' && token === '\u201E') {
          quoteCounts[normalizedQuo] = 0
        }
        if (this.lang === 'cs' && token === '\u201C') {
          quoteCounts[normalizedQuo] = 1
        }

        if (quoteCounts[normalizedQuo] % 2 === 0) {
          if (
            this.lang === 'en' &&
            token === "'" &&
            i > 0 &&
            /[s]$/.test(tokenArray[i - 1])
          ) {
            // Possessive of a word ending in "s": left shift, and do not
            // count it as a quotation mark
            detokenizedText += token
            prependSpace = ' '
          } else {
            // Opening quote: right shift
            detokenizedText += prependSpace + token
            prependSpace = ''
            quoteCounts[normalizedQuo]++
          }
        } else {
          // Closing quote: left shift
          detokenizedText += token
          prependSpace = ' '
          quoteCounts[normalizedQuo]++
        }
      } else if (
        this.lang === 'fi' &&
        i > 0 &&
        /:$/.test(tokenArray[i - 1]) &&
        this.FINNISH_REGEX.test(token)
      ) {
        // Finnish case suffix after ":" (EU:n, EU:ssa, ...): left shift so
        // that no space separates the colon from the suffix
        detokenizedText += token
        prependSpace = ' '
      } else {
        // Default case - just add the token with appropriate spacing
        detokenizedText += prependSpace + token
        prependSpace = ' '
      }
    }

    // Merge multiple spaces
    const [regexpSpace, substitutionSpace] = this.ONE_SPACE
    detokenizedText = detokenizedText.replace(regexpSpace, substitutionSpace)

    // Remove heading and trailing spaces
    detokenizedText = detokenizedText.trim()

    if (returnStr) {
      return detokenizedText
    }
    // Drop empty pieces so empty text yields [] rather than ['']
    return detokenizedText.split(/\s+/).filter((t) => t.length > 0)
  }

  /**
   * Alias for tokenize to match the original Python API
   * @param {Array} tokens - Array of tokens to detokenize
   * @param {boolean} returnStr - Whether to return a string or array
   * @param {boolean} unescape - Whether to unescape XML
   * @returns {string|Array} - Detokenized text
   */
  detokenize (tokens, returnStr = true, unescape = true) {
    return this.tokenize(tokens, returnStr, unescape)
  }
}
1213
+
1214
// Public API of this module: the Moses tokenizer and detokenizer classes.
module.exports = { MosesTokenizer, MosesDetokenizer }