@qvac/translation-nmtcpp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -0
- package/README.md +470 -0
- package/binding.js +1 -0
- package/index.d.ts +82 -0
- package/index.js +188 -0
- package/lib/error.js +65 -0
- package/marian.js +186 -0
- package/package.json +69 -0
- package/prebuilds/android-arm/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/android-arm64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/android-ia32/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/android-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare.exports +3622 -0
- package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare.exports +3731 -0
- package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare.exports +3603 -0
- package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare.exports +3603 -0
- package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare.exports +3720 -0
- package/prebuilds/linux-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare.exports +0 -0
- package/third-party/indic-processor-deps/indicnlp/INDIC_NLP_LICENCE +9 -0
- package/third-party/indic-processor-deps/indicnlp/index.js +11 -0
- package/third-party/indic-processor-deps/indicnlp/indic_detokenize.js +141 -0
- package/third-party/indic-processor-deps/indicnlp/indic_normalize.js +1213 -0
- package/third-party/indic-processor-deps/indicnlp/indic_tokenize.js +123 -0
- package/third-party/indic-processor-deps/indicnlp/langinfo.js +609 -0
- package/third-party/indic-processor-deps/indicnlp/sinhala_transliterator.js +197 -0
- package/third-party/indic-processor-deps/indicnlp/unicode_transliterator.js +120 -0
- package/third-party/indic-processor-deps/sacremoses/SACREMOSES_LICENCE +21 -0
- package/third-party/indic-processor-deps/sacremoses/cjk.js +202 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/README.txt +8 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.as +65 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.bn +65 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ca +75 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.cs +390 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.de +325 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.el +1568 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.en +123 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.es +118 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.et +138 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fi +138 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fr +153 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ga +48 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.gu +105 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hi +113 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hu +103 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.is +251 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.it +180 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.kn +70 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lt +698 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lv +100 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ml +67 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mni +65 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mr +113 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.nl +115 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.or +101 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pa +102 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pl +283 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pt +210 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ro +38 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ru +293 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sk +474 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sl +78 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sv +97 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ta +71 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.tdt +210 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.te +70 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.yue +53 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.zh +53 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJK.txt +23246 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJKSymbols.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Close_Punctuation.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Currency_Symbol.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Han.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul_Syllables.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hiragana.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum-unichars-au.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha-unichars-au.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsLower.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsN.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPf.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPi.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSc.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSo.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsUpper.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Katakana.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Line_Separator.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Lowercase_Letter.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Number.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Open_Punctuation.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Punctuation.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Separator.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Symbol.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Titlecase_Letter.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Uppercase_Letter.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/index.js +8 -0
- package/third-party/indic-processor-deps/sacremoses/indic.js +76 -0
- package/third-party/indic-processor-deps/sacremoses/normalizer.js +264 -0
- package/third-party/indic-processor-deps/sacremoses/pernuliprops.js +287 -0
- package/third-party/indic-processor-deps/sacremoses/tokenizer.js +1217 -0
- package/third-party/indic-processor.js +565 -0
|
@@ -0,0 +1,1217 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JavaScript port of the Moses Tokenizer from
|
|
3
|
+
* https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
const { Perluniprops, NonbreakingPrefixes } = require('./pernuliprops')
|
|
7
|
+
const { VIRAMAS, NUKTAS } = require('./indic')
|
|
8
|
+
const { isCJK } = require('./cjk')
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* MosesTokenizer class for tokenizing text in various languages
|
|
12
|
+
*/
|
|
13
|
+
/**
|
|
14
|
+
* JavaScript port of the Moses Tokenizer from
|
|
15
|
+
* https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
class MosesTokenizer {
|
|
19
|
+
/**
|
|
20
|
+
* Initialize a new Moses Tokenizer
|
|
21
|
+
* @param {string} lang - Language code (default: "en")
|
|
22
|
+
* @param {string|null} customNonbreakingPrefixesFile - Path to custom prefixes file
|
|
23
|
+
*/
|
|
24
|
+
constructor (lang = 'en', customNonbreakingPrefixesFile = null) {
|
|
25
|
+
this.lang = lang
|
|
26
|
+
|
|
27
|
+
// Initialize Perluniprops and NonbreakingPrefixes
|
|
28
|
+
this.perluniprops = new Perluniprops()
|
|
29
|
+
this.nonbreaking_prefixes = new NonbreakingPrefixes()
|
|
30
|
+
|
|
31
|
+
// Perl Unicode Properties character sets.
|
|
32
|
+
// Note: In JavaScript we'll convert the generator to arrays/strings for regex use
|
|
33
|
+
this.IsN = this._joinFromGenerator(this.perluniprops.chars('IsN'))
|
|
34
|
+
|
|
35
|
+
// Build IsAlnum with VIRAMAS and NUKTAS
|
|
36
|
+
const alnumChars = this._joinFromGenerator(
|
|
37
|
+
this.perluniprops.chars('IsAlnum')
|
|
38
|
+
)
|
|
39
|
+
this.IsAlnum = alnumChars + VIRAMAS + NUKTAS
|
|
40
|
+
|
|
41
|
+
this.IsSc = this._joinFromGenerator(this.perluniprops.chars('IsSc'))
|
|
42
|
+
this.IsSo = this._joinFromGenerator(this.perluniprops.chars('IsSo'))
|
|
43
|
+
|
|
44
|
+
// Build IsAlpha with VIRAMAS and NUKTAS
|
|
45
|
+
const alphaChars = this._joinFromGenerator(
|
|
46
|
+
this.perluniprops.chars('IsAlpha')
|
|
47
|
+
)
|
|
48
|
+
this.IsAlpha = alphaChars + VIRAMAS + NUKTAS
|
|
49
|
+
|
|
50
|
+
this.IsLower = this._joinFromGenerator(this.perluniprops.chars('IsLower'))
|
|
51
|
+
|
|
52
|
+
// Remove ASCII junk.
|
|
53
|
+
this.DEDUPLICATE_SPACE = [/\s+/g, ' ']
|
|
54
|
+
// eslint-disable-next-line no-control-regex
|
|
55
|
+
this.ASCII_JUNK = [/[\u0000-\u001F]/g, '']
|
|
56
|
+
|
|
57
|
+
// Pad all "other" special characters not in IsAlnum.
|
|
58
|
+
this.PAD_NOT_ISALNUM = [
|
|
59
|
+
new RegExp(
|
|
60
|
+
`([^${this._escapeRegExp(this.IsAlnum)}\\s\\.'\`\\,\\-])`,
|
|
61
|
+
'g'
|
|
62
|
+
),
|
|
63
|
+
' $1 '
|
|
64
|
+
]
|
|
65
|
+
|
|
66
|
+
// Splits all hyphens (regardless of circumstances), e.g. 'foo-bar' -> 'foo @-@ bar'
|
|
67
|
+
this.AGGRESSIVE_HYPHEN_SPLIT = [
|
|
68
|
+
new RegExp(
|
|
69
|
+
`([${this._escapeRegExp(this.IsAlnum)}])\\-(?=[${this._escapeRegExp(
|
|
70
|
+
this.IsAlnum
|
|
71
|
+
)}])`,
|
|
72
|
+
'g'
|
|
73
|
+
),
|
|
74
|
+
'$1 @-@ '
|
|
75
|
+
]
|
|
76
|
+
|
|
77
|
+
// Make multi-dots stay together.
|
|
78
|
+
this.REPLACE_DOT_WITH_LITERALSTRING_1 = [/.([.]+)/g, ' DOTMULTI$1']
|
|
79
|
+
this.REPLACE_DOT_WITH_LITERALSTRING_2 = [
|
|
80
|
+
/DOTMULTI\.([^.])/,
|
|
81
|
+
'DOTDOTMULTI $1'
|
|
82
|
+
]
|
|
83
|
+
this.REPLACE_DOT_WITH_LITERALSTRING_3 = [/DOTMULTI\./g, 'DOTDOTMULTI']
|
|
84
|
+
|
|
85
|
+
// Separate out "," except if within numbers (5,300)
|
|
86
|
+
this.COMMA_SEPARATE_1 = [
|
|
87
|
+
new RegExp(`([^${this._escapeRegExp(this.IsN)}])[,]`, 'g'),
|
|
88
|
+
'$1 , '
|
|
89
|
+
]
|
|
90
|
+
this.COMMA_SEPARATE_2 = [
|
|
91
|
+
new RegExp(`[,]([^${this._escapeRegExp(this.IsN)}])`, 'g'),
|
|
92
|
+
' , $1'
|
|
93
|
+
]
|
|
94
|
+
this.COMMA_SEPARATE_3 = [
|
|
95
|
+
new RegExp(`([${this._escapeRegExp(this.IsN)}])[,]$`, 'g'),
|
|
96
|
+
'$1 , '
|
|
97
|
+
]
|
|
98
|
+
|
|
99
|
+
// Attempt to get correct directional quotes.
|
|
100
|
+
this.DIRECTIONAL_QUOTE_1 = [/^``/g, '`` ']
|
|
101
|
+
this.DIRECTIONAL_QUOTE_2 = [/^"/g, '`` ']
|
|
102
|
+
this.DIRECTIONAL_QUOTE_3 = [/^`([^`])/g, '` $1']
|
|
103
|
+
this.DIRECTIONAL_QUOTE_4 = [/^'/g, '` ']
|
|
104
|
+
this.DIRECTIONAL_QUOTE_5 = [/([ ([{<])"/g, '$1 `` ']
|
|
105
|
+
this.DIRECTIONAL_QUOTE_6 = [/([ ([{<])``/g, '$1 `` ']
|
|
106
|
+
this.DIRECTIONAL_QUOTE_7 = [/([ ([{<])`([^`])/g, '$1 ` $2']
|
|
107
|
+
this.DIRECTIONAL_QUOTE_8 = [/([ ([{<])'/g, '$1 ` ']
|
|
108
|
+
|
|
109
|
+
// Replace ... with _ELLIPSIS_ and later restore
|
|
110
|
+
this.REPLACE_ELLIPSIS = [/\.\.\./g, ' _ELLIPSIS_ ']
|
|
111
|
+
this.RESTORE_ELLIPSIS = [/_ELLIPSIS_/g, '...']
|
|
112
|
+
|
|
113
|
+
// Pad , with tailing space except if within numbers, e.g. 5,300
|
|
114
|
+
this.COMMA_1 = [
|
|
115
|
+
new RegExp(
|
|
116
|
+
`([^${this._escapeRegExp(this.IsN)}])[,]([^${this._escapeRegExp(
|
|
117
|
+
this.IsN
|
|
118
|
+
)}])`,
|
|
119
|
+
'g'
|
|
120
|
+
),
|
|
121
|
+
'$1 , $2'
|
|
122
|
+
]
|
|
123
|
+
this.COMMA_2 = [
|
|
124
|
+
new RegExp(
|
|
125
|
+
`([${this._escapeRegExp(this.IsN)}])[,]([^${this._escapeRegExp(
|
|
126
|
+
this.IsN
|
|
127
|
+
)}])`,
|
|
128
|
+
'g'
|
|
129
|
+
),
|
|
130
|
+
'$1 , $2'
|
|
131
|
+
]
|
|
132
|
+
this.COMMA_3 = [
|
|
133
|
+
new RegExp(
|
|
134
|
+
`([^${this._escapeRegExp(this.IsN)}])[,]([${this._escapeRegExp(
|
|
135
|
+
this.IsN
|
|
136
|
+
)}])`,
|
|
137
|
+
'g'
|
|
138
|
+
),
|
|
139
|
+
'$1 , $2'
|
|
140
|
+
]
|
|
141
|
+
|
|
142
|
+
// Pad unicode symbols with spaces.
|
|
143
|
+
this.SYMBOLS = [
|
|
144
|
+
new RegExp(
|
|
145
|
+
`([;:@#\\$%&${this._escapeRegExp(this.IsSc)}${this._escapeRegExp(
|
|
146
|
+
this.IsSo
|
|
147
|
+
)}])`,
|
|
148
|
+
'g'
|
|
149
|
+
),
|
|
150
|
+
' $1 '
|
|
151
|
+
]
|
|
152
|
+
|
|
153
|
+
// Separate out intra-token slashes.
|
|
154
|
+
this.INTRATOKEN_SLASHES = [
|
|
155
|
+
new RegExp(
|
|
156
|
+
`([${this._escapeRegExp(this.IsAlnum)}])\\/([${this._escapeRegExp(
|
|
157
|
+
this.IsAlnum
|
|
158
|
+
)}])`,
|
|
159
|
+
'g'
|
|
160
|
+
),
|
|
161
|
+
'$1 @/@ $2'
|
|
162
|
+
]
|
|
163
|
+
|
|
164
|
+
// Splits final period at end of string.
|
|
165
|
+
this.FINAL_PERIOD = [/([^.])([.])([\\]\)}>"']*) ?$/g, '$1 $2$3']
|
|
166
|
+
|
|
167
|
+
// Pad all question marks and exclamation marks with spaces.
|
|
168
|
+
this.PAD_QUESTION_EXCLAMATION_MARK = [/([?!])/g, ' $1 ']
|
|
169
|
+
|
|
170
|
+
// Handles parentheses, brackets and converts them to PTB symbols.
|
|
171
|
+
this.PAD_PARENTHESIS = [/([\][(){}<>])/g, ' $1 ']
|
|
172
|
+
this.CONVERT_PARENTHESIS_1 = [/\(/g, '-LRB-']
|
|
173
|
+
this.CONVERT_PARENTHESIS_2 = [/\)/g, '-RRB-']
|
|
174
|
+
this.CONVERT_PARENTHESIS_3 = [/\[/g, '-LSB-']
|
|
175
|
+
this.CONVERT_PARENTHESIS_4 = [/\]/g, '-RSB-']
|
|
176
|
+
this.CONVERT_PARENTHESIS_5 = [/\{/g, '-LCB-']
|
|
177
|
+
this.CONVERT_PARENTHESIS_6 = [/\}/g, '-RCB-']
|
|
178
|
+
|
|
179
|
+
// Pads double dashes with spaces.
|
|
180
|
+
this.PAD_DOUBLE_DASHES = [/--/g, ' -- ']
|
|
181
|
+
|
|
182
|
+
// Adds spaces to start and end of string to simplify further regexps.
|
|
183
|
+
this.PAD_START_OF_STR = [/^/g, ' ']
|
|
184
|
+
this.PAD_END_OF_STR = [/$/g, ' ']
|
|
185
|
+
|
|
186
|
+
// Converts double quotes to two single quotes and pad with spaces.
|
|
187
|
+
this.CONVERT_DOUBLE_TO_SINGLE_QUOTES = [/"/g, " '' "]
|
|
188
|
+
|
|
189
|
+
// Handles single quote in possessives or close-single-quote.
|
|
190
|
+
this.HANDLES_SINGLE_QUOTES = [/([^'])' /g, "$1 ' "]
|
|
191
|
+
|
|
192
|
+
// Pad apostrophe in possessive or close-single-quote.
|
|
193
|
+
this.APOSTROPHE = [/([^'])'/, "$1 ' "]
|
|
194
|
+
|
|
195
|
+
// Prepend space on contraction apostrophe.
|
|
196
|
+
this.CONTRACTION_1 = [/'([sSmMdD]) /g, " '$1 "]
|
|
197
|
+
this.CONTRACTION_2 = [/'ll /g, " 'll "]
|
|
198
|
+
this.CONTRACTION_3 = [/'re /g, " 're "]
|
|
199
|
+
this.CONTRACTION_4 = [/'ve /g, " 've "]
|
|
200
|
+
this.CONTRACTION_5 = [/n't /g, " n't "]
|
|
201
|
+
this.CONTRACTION_6 = [/'LL /g, " 'LL "]
|
|
202
|
+
this.CONTRACTION_7 = [/'RE /g, " 'RE "]
|
|
203
|
+
this.CONTRACTION_8 = [/'VE /g, " 'VE "]
|
|
204
|
+
this.CONTRACTION_9 = [/N'T /g, " N'T "]
|
|
205
|
+
|
|
206
|
+
// Informal Contractions.
|
|
207
|
+
this.CONTRACTION_10 = [/ ([Cc])annot /g, ' $1an not ']
|
|
208
|
+
this.CONTRACTION_11 = [/ ([Dd])'ye /g, " $1' ye "]
|
|
209
|
+
this.CONTRACTION_12 = [/ ([Gg])imme /g, ' $1im me ']
|
|
210
|
+
this.CONTRACTION_13 = [/ ([Gg])onna /g, ' $1on na ']
|
|
211
|
+
this.CONTRACTION_14 = [/ ([Gg])otta /g, ' $1ot ta ']
|
|
212
|
+
this.CONTRACTION_15 = [/ ([Ll])emme /g, ' $1em me ']
|
|
213
|
+
this.CONTRACTION_16 = [/ ([Mm])ore'n /g, " $1ore 'n "]
|
|
214
|
+
this.CONTRACTION_17 = [/ '([Tt])is /g, " '$1 is "]
|
|
215
|
+
this.CONTRACTION_18 = [/ '([Tt])was /g, " '$1 was "]
|
|
216
|
+
this.CONTRACTION_19 = [/ ([Ww])anna /g, ' $1an na ']
|
|
217
|
+
|
|
218
|
+
// Clean out extra spaces
|
|
219
|
+
this.CLEAN_EXTRA_SPACE_1 = [/ */g, ' ']
|
|
220
|
+
this.CLEAN_EXTRA_SPACE_2 = [/^ */g, '']
|
|
221
|
+
this.CLEAN_EXTRA_SPACE_3 = [/ *$/g, '']
|
|
222
|
+
|
|
223
|
+
// Neurotic Perl regexes to escape special characters.
|
|
224
|
+
this.ESCAPE_AMPERSAND = [/&/g, '&']
|
|
225
|
+
this.ESCAPE_PIPE = [/\|/g, '|']
|
|
226
|
+
this.ESCAPE_LEFT_ANGLE_BRACKET = [/</g, '<']
|
|
227
|
+
this.ESCAPE_RIGHT_ANGLE_BRACKET = [/>/g, '>']
|
|
228
|
+
this.ESCAPE_SINGLE_QUOTE = [/'/g, ''']
|
|
229
|
+
this.ESCAPE_DOUBLE_QUOTE = [/"/g, '"']
|
|
230
|
+
this.ESCAPE_LEFT_SQUARE_BRACKET = [/\[/g, '[']
|
|
231
|
+
this.ESCAPE_RIGHT_SQUARE_BRACKET = [/\]/g, ']']
|
|
232
|
+
|
|
233
|
+
// English-specific patterns for handling contractions and possessives
|
|
234
|
+
this.EN_SPECIFIC_1 = [
|
|
235
|
+
new RegExp(
|
|
236
|
+
`([^${this._escapeRegExp(this.IsAlpha)}])[']([^${this._escapeRegExp(
|
|
237
|
+
this.IsAlpha
|
|
238
|
+
)}])`,
|
|
239
|
+
'g'
|
|
240
|
+
),
|
|
241
|
+
"$1 ' $2"
|
|
242
|
+
]
|
|
243
|
+
this.EN_SPECIFIC_2 = [
|
|
244
|
+
new RegExp(
|
|
245
|
+
`([^${this._escapeRegExp(this.IsAlpha)}${this._escapeRegExp(
|
|
246
|
+
this.IsN
|
|
247
|
+
)}])[']([${this._escapeRegExp(this.IsAlpha)}])`,
|
|
248
|
+
'g'
|
|
249
|
+
),
|
|
250
|
+
"$1 ' $2"
|
|
251
|
+
]
|
|
252
|
+
this.EN_SPECIFIC_3 = [
|
|
253
|
+
new RegExp(
|
|
254
|
+
`([${this._escapeRegExp(this.IsAlpha)}])[']([^${this._escapeRegExp(
|
|
255
|
+
this.IsAlpha
|
|
256
|
+
)}])`,
|
|
257
|
+
'g'
|
|
258
|
+
),
|
|
259
|
+
"$1 ' $2"
|
|
260
|
+
]
|
|
261
|
+
this.EN_SPECIFIC_4 = [
|
|
262
|
+
new RegExp(
|
|
263
|
+
`([${this._escapeRegExp(this.IsAlpha)}])[']([${this._escapeRegExp(
|
|
264
|
+
this.IsAlpha
|
|
265
|
+
)}])`,
|
|
266
|
+
'g'
|
|
267
|
+
),
|
|
268
|
+
"$1 '$2"
|
|
269
|
+
]
|
|
270
|
+
this.EN_SPECIFIC_5 = [
|
|
271
|
+
new RegExp(`([${this._escapeRegExp(this.IsN)}])[']([s])`, 'g'),
|
|
272
|
+
"$1 '$2"
|
|
273
|
+
]
|
|
274
|
+
|
|
275
|
+
this.ENGLISH_SPECIFIC_APOSTROPHE = [
|
|
276
|
+
this.EN_SPECIFIC_1,
|
|
277
|
+
this.EN_SPECIFIC_2,
|
|
278
|
+
this.EN_SPECIFIC_3,
|
|
279
|
+
this.EN_SPECIFIC_4,
|
|
280
|
+
this.EN_SPECIFIC_5
|
|
281
|
+
]
|
|
282
|
+
|
|
283
|
+
// French/Italian specific patterns
|
|
284
|
+
this.FR_IT_SPECIFIC_1 = [
|
|
285
|
+
new RegExp(
|
|
286
|
+
`([^${this._escapeRegExp(this.IsAlpha)}])[']([^${this._escapeRegExp(
|
|
287
|
+
this.IsAlpha
|
|
288
|
+
)}])`,
|
|
289
|
+
'g'
|
|
290
|
+
),
|
|
291
|
+
"$1 ' $2"
|
|
292
|
+
]
|
|
293
|
+
this.FR_IT_SPECIFIC_2 = [
|
|
294
|
+
new RegExp(
|
|
295
|
+
`([^${this._escapeRegExp(this.IsAlpha)}])[']([${this._escapeRegExp(
|
|
296
|
+
this.IsAlpha
|
|
297
|
+
)}])`,
|
|
298
|
+
'g'
|
|
299
|
+
),
|
|
300
|
+
"$1 ' $2"
|
|
301
|
+
]
|
|
302
|
+
this.FR_IT_SPECIFIC_3 = [
|
|
303
|
+
new RegExp(
|
|
304
|
+
`([${this._escapeRegExp(this.IsAlpha)}])[']([^${this._escapeRegExp(
|
|
305
|
+
this.IsAlpha
|
|
306
|
+
)}])`,
|
|
307
|
+
'g'
|
|
308
|
+
),
|
|
309
|
+
"$1 ' $2"
|
|
310
|
+
]
|
|
311
|
+
this.FR_IT_SPECIFIC_4 = [
|
|
312
|
+
new RegExp(
|
|
313
|
+
`([${this._escapeRegExp(this.IsAlpha)}])[']([${this._escapeRegExp(
|
|
314
|
+
this.IsAlpha
|
|
315
|
+
)}])`,
|
|
316
|
+
'g'
|
|
317
|
+
),
|
|
318
|
+
"$1' $2"
|
|
319
|
+
]
|
|
320
|
+
|
|
321
|
+
this.FR_IT_SPECIFIC_APOSTROPHE = [
|
|
322
|
+
this.FR_IT_SPECIFIC_1,
|
|
323
|
+
this.FR_IT_SPECIFIC_2,
|
|
324
|
+
this.FR_IT_SPECIFIC_3,
|
|
325
|
+
this.FR_IT_SPECIFIC_4
|
|
326
|
+
]
|
|
327
|
+
|
|
328
|
+
this.NON_SPECIFIC_APOSTROPHE = [/'/g, " ' "]
|
|
329
|
+
|
|
330
|
+
this.TRAILING_DOT_APOSTROPHE = [/\.' ?$/g, " . ' "]
|
|
331
|
+
|
|
332
|
+
// Protected patterns
|
|
333
|
+
this.BASIC_PROTECTED_PATTERN_1 = /<\/?\S+\/?>/
|
|
334
|
+
this.BASIC_PROTECTED_PATTERN_2 = /<\S+( [a-zA-Z0-9]+="?[^"]*")+ ?\/?>/
|
|
335
|
+
this.BASIC_PROTECTED_PATTERN_3 = /<\S+( [a-zA-Z0-9]+='?[^']*')+ ?\/?>/
|
|
336
|
+
this.BASIC_PROTECTED_PATTERN_4 = /[\w\-_.]+@([\w\-_]+\.)+[a-zA-Z]{2,}/
|
|
337
|
+
this.BASIC_PROTECTED_PATTERN_5 =
|
|
338
|
+
/(https?|ftp):\/\/[^:/\s]+(\/\w+)*\/[\w\-.]+/
|
|
339
|
+
|
|
340
|
+
// Collected into an array for easy use
|
|
341
|
+
this.BASIC_PROTECTED_PATTERNS = [
|
|
342
|
+
this.BASIC_PROTECTED_PATTERN_1,
|
|
343
|
+
this.BASIC_PROTECTED_PATTERN_2,
|
|
344
|
+
this.BASIC_PROTECTED_PATTERN_3,
|
|
345
|
+
this.BASIC_PROTECTED_PATTERN_4,
|
|
346
|
+
this.BASIC_PROTECTED_PATTERN_5
|
|
347
|
+
]
|
|
348
|
+
|
|
349
|
+
this.WEB_PROTECTED_PATTERNS = [
|
|
350
|
+
/((https?|ftp|rsync):\/\/|www\.)[^ ]*/, // URLs
|
|
351
|
+
/[\w\-_.]+@([\w\-_]+\.)+[a-zA-Z]{2,}/, // Emails
|
|
352
|
+
/@[a-zA-Z0-9_]+/, // @handler such as twitter/github ID
|
|
353
|
+
/#[a-zA-Z0-9_]+/ // @hashtag
|
|
354
|
+
]
|
|
355
|
+
|
|
356
|
+
// Groups of regexes for different stages of tokenization
|
|
357
|
+
this.MOSES_PENN_REGEXES_1 = [
|
|
358
|
+
this.DEDUPLICATE_SPACE,
|
|
359
|
+
this.ASCII_JUNK,
|
|
360
|
+
this.DIRECTIONAL_QUOTE_1,
|
|
361
|
+
this.DIRECTIONAL_QUOTE_2,
|
|
362
|
+
this.DIRECTIONAL_QUOTE_3,
|
|
363
|
+
this.DIRECTIONAL_QUOTE_4,
|
|
364
|
+
this.DIRECTIONAL_QUOTE_5,
|
|
365
|
+
this.DIRECTIONAL_QUOTE_6,
|
|
366
|
+
this.DIRECTIONAL_QUOTE_7,
|
|
367
|
+
this.DIRECTIONAL_QUOTE_8,
|
|
368
|
+
this.REPLACE_ELLIPSIS,
|
|
369
|
+
this.COMMA_1,
|
|
370
|
+
this.COMMA_2,
|
|
371
|
+
this.COMMA_3,
|
|
372
|
+
this.SYMBOLS,
|
|
373
|
+
this.INTRATOKEN_SLASHES,
|
|
374
|
+
this.FINAL_PERIOD,
|
|
375
|
+
this.PAD_QUESTION_EXCLAMATION_MARK,
|
|
376
|
+
this.PAD_PARENTHESIS,
|
|
377
|
+
this.CONVERT_PARENTHESIS_1,
|
|
378
|
+
this.CONVERT_PARENTHESIS_2,
|
|
379
|
+
this.CONVERT_PARENTHESIS_3,
|
|
380
|
+
this.CONVERT_PARENTHESIS_4,
|
|
381
|
+
this.CONVERT_PARENTHESIS_5,
|
|
382
|
+
this.CONVERT_PARENTHESIS_6,
|
|
383
|
+
this.PAD_DOUBLE_DASHES,
|
|
384
|
+
this.PAD_START_OF_STR,
|
|
385
|
+
this.PAD_END_OF_STR,
|
|
386
|
+
this.CONVERT_DOUBLE_TO_SINGLE_QUOTES,
|
|
387
|
+
this.HANDLES_SINGLE_QUOTES,
|
|
388
|
+
this.APOSTROPHE,
|
|
389
|
+
this.CONTRACTION_1,
|
|
390
|
+
this.CONTRACTION_2,
|
|
391
|
+
this.CONTRACTION_3,
|
|
392
|
+
this.CONTRACTION_4,
|
|
393
|
+
this.CONTRACTION_5,
|
|
394
|
+
this.CONTRACTION_6,
|
|
395
|
+
this.CONTRACTION_7,
|
|
396
|
+
this.CONTRACTION_8,
|
|
397
|
+
this.CONTRACTION_9,
|
|
398
|
+
this.CONTRACTION_10,
|
|
399
|
+
this.CONTRACTION_11,
|
|
400
|
+
this.CONTRACTION_12,
|
|
401
|
+
this.CONTRACTION_13,
|
|
402
|
+
this.CONTRACTION_14,
|
|
403
|
+
this.CONTRACTION_15,
|
|
404
|
+
this.CONTRACTION_16,
|
|
405
|
+
this.CONTRACTION_17,
|
|
406
|
+
this.CONTRACTION_18,
|
|
407
|
+
this.CONTRACTION_19
|
|
408
|
+
]
|
|
409
|
+
|
|
410
|
+
this.MOSES_PENN_REGEXES_2 = [
|
|
411
|
+
this.RESTORE_ELLIPSIS,
|
|
412
|
+
this.CLEAN_EXTRA_SPACE_1,
|
|
413
|
+
this.CLEAN_EXTRA_SPACE_2,
|
|
414
|
+
this.CLEAN_EXTRA_SPACE_3,
|
|
415
|
+
this.ESCAPE_AMPERSAND,
|
|
416
|
+
this.ESCAPE_PIPE,
|
|
417
|
+
this.ESCAPE_LEFT_ANGLE_BRACKET,
|
|
418
|
+
this.ESCAPE_RIGHT_ANGLE_BRACKET,
|
|
419
|
+
this.ESCAPE_SINGLE_QUOTE,
|
|
420
|
+
this.ESCAPE_DOUBLE_QUOTE
|
|
421
|
+
]
|
|
422
|
+
|
|
423
|
+
this.MOSES_ESCAPE_XML_REGEXES = [
|
|
424
|
+
this.ESCAPE_AMPERSAND,
|
|
425
|
+
this.ESCAPE_PIPE,
|
|
426
|
+
this.ESCAPE_LEFT_ANGLE_BRACKET,
|
|
427
|
+
this.ESCAPE_RIGHT_ANGLE_BRACKET,
|
|
428
|
+
this.ESCAPE_SINGLE_QUOTE,
|
|
429
|
+
this.ESCAPE_DOUBLE_QUOTE,
|
|
430
|
+
this.ESCAPE_LEFT_SQUARE_BRACKET,
|
|
431
|
+
this.ESCAPE_RIGHT_SQUARE_BRACKET
|
|
432
|
+
]
|
|
433
|
+
|
|
434
|
+
// Initialize the language specific nonbreaking prefixes.
|
|
435
|
+
this.NONBREAKING_PREFIXES = this.nonbreaking_prefixes
|
|
436
|
+
.getWordsAsArray(lang)
|
|
437
|
+
.map((nbp) => nbp.trim())
|
|
438
|
+
|
|
439
|
+
// Load custom nonbreaking prefixes file.
|
|
440
|
+
if (customNonbreakingPrefixesFile) {
|
|
441
|
+
// In a real implementation, this would load from a file
|
|
442
|
+
this.NONBREAKING_PREFIXES = []
|
|
443
|
+
// Code to read from file would go here
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
this.NUMERIC_ONLY_PREFIXES = this.NONBREAKING_PREFIXES.filter((w) =>
|
|
447
|
+
this.hasNumericOnly(w)
|
|
448
|
+
).map((w) => w.split(' ')[0])
|
|
449
|
+
|
|
450
|
+
// Add CJK characters to alpha and alnum
|
|
451
|
+
if (['zh', 'ja', 'ko', 'cjk'].includes(this.lang)) {
|
|
452
|
+
let cjkChars = ''
|
|
453
|
+
if (['ko', 'cjk'].includes(this.lang)) {
|
|
454
|
+
cjkChars += this._joinFromGenerator(this.perluniprops.chars('Hangul'))
|
|
455
|
+
}
|
|
456
|
+
if (['zh', 'cjk'].includes(this.lang)) {
|
|
457
|
+
cjkChars += this._joinFromGenerator(this.perluniprops.chars('Han'))
|
|
458
|
+
}
|
|
459
|
+
if (['ja', 'cjk'].includes(this.lang)) {
|
|
460
|
+
cjkChars += this._joinFromGenerator(
|
|
461
|
+
this.perluniprops.chars('Hiragana')
|
|
462
|
+
)
|
|
463
|
+
cjkChars += this._joinFromGenerator(
|
|
464
|
+
this.perluniprops.chars('Katakana')
|
|
465
|
+
)
|
|
466
|
+
cjkChars += this._joinFromGenerator(this.perluniprops.chars('Han'))
|
|
467
|
+
}
|
|
468
|
+
this.IsAlpha += cjkChars
|
|
469
|
+
this.IsAlnum += cjkChars
|
|
470
|
+
|
|
471
|
+
// Overwrite the alnum regexes
|
|
472
|
+
this.PAD_NOT_ISALNUM = [
|
|
473
|
+
new RegExp(
|
|
474
|
+
`([^${this._escapeRegExp(this.IsAlnum)}\\s\\.'\`\\,\\-])`,
|
|
475
|
+
'g'
|
|
476
|
+
),
|
|
477
|
+
' $1 '
|
|
478
|
+
]
|
|
479
|
+
this.AGGRESSIVE_HYPHEN_SPLIT = [
|
|
480
|
+
new RegExp(
|
|
481
|
+
`([${this._escapeRegExp(this.IsAlnum)}])\\-(?=[${this._escapeRegExp(
|
|
482
|
+
this.IsAlnum
|
|
483
|
+
)}])`,
|
|
484
|
+
'g'
|
|
485
|
+
),
|
|
486
|
+
'$1 @-@ '
|
|
487
|
+
]
|
|
488
|
+
this.INTRATOKEN_SLASHES = [
|
|
489
|
+
new RegExp(
|
|
490
|
+
`([${this._escapeRegExp(this.IsAlnum)}])\\/([${this._escapeRegExp(
|
|
491
|
+
this.IsAlnum
|
|
492
|
+
)}])`,
|
|
493
|
+
'g'
|
|
494
|
+
),
|
|
495
|
+
'$1 @/@ $2'
|
|
496
|
+
]
|
|
497
|
+
}
|
|
498
|
+
}
|
|
499
|
+
|
|
500
|
+
/**
|
|
501
|
+
* Helper method to escape special characters in a string for regex
|
|
502
|
+
* @param {string} str - String to escape
|
|
503
|
+
* @returns {string} - Escaped string
|
|
504
|
+
* @private
|
|
505
|
+
*/
|
|
506
|
+
_escapeRegExp (str) {
|
|
507
|
+
return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
|
|
508
|
+
}
|
|
509
|
+
|
|
510
|
+
/**
|
|
511
|
+
* Helper method to convert a generator to a string
|
|
512
|
+
* @param {Generator} generator - Generator to convert
|
|
513
|
+
* @returns {string} - Resulting string
|
|
514
|
+
* @private
|
|
515
|
+
*/
|
|
516
|
+
_joinFromGenerator (generator) {
|
|
517
|
+
let result = ''
|
|
518
|
+
for (const char of generator) {
|
|
519
|
+
result += char
|
|
520
|
+
}
|
|
521
|
+
return result
|
|
522
|
+
}
|
|
523
|
+
|
|
524
|
+
/**
|
|
525
|
+
* Replaces multi-dots with placeholder text
|
|
526
|
+
* @param {string} text - Input text
|
|
527
|
+
* @returns {string} - Processed text
|
|
528
|
+
*/
|
|
529
|
+
replaceMultidots (text) {
|
|
530
|
+
text = text.replace(/\.([.]+)/g, ' DOTMULTI$1')
|
|
531
|
+
const dotmulti = /DOTMULTI\./
|
|
532
|
+
while (dotmulti.test(text)) {
|
|
533
|
+
text = text.replace(/DOTMULTI\.([^.])/g, 'DOTDOTMULTI $1')
|
|
534
|
+
text = text.replace(dotmulti, 'DOTDOTMULTI')
|
|
535
|
+
}
|
|
536
|
+
return text
|
|
537
|
+
}
|
|
538
|
+
|
|
539
|
+
/**
|
|
540
|
+
* Restores multi-dots from placeholder text
|
|
541
|
+
* @param {string} text - Input text
|
|
542
|
+
* @returns {string} - Processed text
|
|
543
|
+
*/
|
|
544
|
+
restoreMultidots (text) {
|
|
545
|
+
const dotmulti = /DOTDOTMULTI/
|
|
546
|
+
while (dotmulti.test(text)) {
|
|
547
|
+
text = text.replace(dotmulti, 'DOTMULTI.')
|
|
548
|
+
}
|
|
549
|
+
return text.replace(/DOTMULTI/g, '.')
|
|
550
|
+
}
|
|
551
|
+
|
|
552
|
+
/**
|
|
553
|
+
* Check if text contains only lowercase characters
|
|
554
|
+
* @param {string} text - Input text
|
|
555
|
+
* @returns {boolean} - True if all characters are lowercase
|
|
556
|
+
*/
|
|
557
|
+
islower (text) {
|
|
558
|
+
for (let i = 0; i < text.length; i++) {
|
|
559
|
+
if (!this.IsLower.includes(text[i])) {
|
|
560
|
+
return false
|
|
561
|
+
}
|
|
562
|
+
}
|
|
563
|
+
return true
|
|
564
|
+
}
|
|
565
|
+
|
|
566
|
+
/**
|
|
567
|
+
* Check if text contains any alphabetic characters
|
|
568
|
+
* @param {string} text - Input text
|
|
569
|
+
* @returns {boolean} - True if any character is alphabetic
|
|
570
|
+
*/
|
|
571
|
+
isanyalpha (text) {
|
|
572
|
+
for (let i = 0; i < text.length; i++) {
|
|
573
|
+
if (this.IsAlpha.includes(text[i])) {
|
|
574
|
+
return true
|
|
575
|
+
}
|
|
576
|
+
}
|
|
577
|
+
return false
|
|
578
|
+
}
|
|
579
|
+
|
|
580
|
+
/**
|
|
581
|
+
* Check if text contains numeric-only marker
|
|
582
|
+
* @param {string} text - Input text
|
|
583
|
+
* @returns {boolean} - True if text has a numeric-only marker
|
|
584
|
+
*/
|
|
585
|
+
hasNumericOnly (text) {
|
|
586
|
+
return /[\s]+(#NUMERIC_ONLY#)/.test(text)
|
|
587
|
+
}
|
|
588
|
+
|
|
589
|
+
/**
|
|
590
|
+
* Handle nonbreaking prefixes in text
|
|
591
|
+
* @param {string} text - Input text
|
|
592
|
+
* @returns {string} - Processed text
|
|
593
|
+
*/
|
|
594
|
+
handlesNonbreakingPrefixes (text) {
|
|
595
|
+
// Splits the text into tokens to check for nonbreaking prefixes
|
|
596
|
+
const tokens = text.split(/\s+/)
|
|
597
|
+
const numTokens = tokens.length
|
|
598
|
+
|
|
599
|
+
for (let i = 0; i < numTokens; i++) {
|
|
600
|
+
const token = tokens[i]
|
|
601
|
+
// Checks if token ends with a fullstop
|
|
602
|
+
const tokenEndsWithPeriod = /^(\S+)\.$/.exec(token)
|
|
603
|
+
|
|
604
|
+
if (tokenEndsWithPeriod) {
|
|
605
|
+
const prefix = tokenEndsWithPeriod[1]
|
|
606
|
+
// Check conditions for nonbreaking prefixes
|
|
607
|
+
if (
|
|
608
|
+
(prefix.includes('.') && this.isanyalpha(prefix)) ||
|
|
609
|
+
(this.NONBREAKING_PREFIXES.includes(prefix) &&
|
|
610
|
+
!this.NUMERIC_ONLY_PREFIXES.includes(prefix)) ||
|
|
611
|
+
(i !== numTokens - 1 &&
|
|
612
|
+
tokens[i + 1] &&
|
|
613
|
+
this.islower(tokens[i + 1][0]))
|
|
614
|
+
) {
|
|
615
|
+
// No change to the token
|
|
616
|
+
} else if (
|
|
617
|
+
// Check if prefix is in NUMERIC_ONLY_PREFIXES and next token is a digit
|
|
618
|
+
this.NUMERIC_ONLY_PREFIXES.includes(prefix) &&
|
|
619
|
+
i + 1 < numTokens &&
|
|
620
|
+
/^[0-9]+/.test(tokens[i + 1])
|
|
621
|
+
) {
|
|
622
|
+
// No change to the token
|
|
623
|
+
} else {
|
|
624
|
+
// Adds a space after the tokens before a dot
|
|
625
|
+
tokens[i] = prefix + ' .'
|
|
626
|
+
}
|
|
627
|
+
}
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
return tokens.join(' ') // Stitch the tokens back
|
|
631
|
+
}
|
|
632
|
+
|
|
633
|
+
/**
|
|
634
|
+
* Escape XML special characters in text
|
|
635
|
+
* @param {string} text - Input text
|
|
636
|
+
* @returns {string} - Processed text
|
|
637
|
+
*/
|
|
638
|
+
escapeXml (text) {
|
|
639
|
+
for (const [regexp, substitution] of this.MOSES_ESCAPE_XML_REGEXES) {
|
|
640
|
+
text = text.replace(regexp, substitution)
|
|
641
|
+
}
|
|
642
|
+
return text
|
|
643
|
+
}
|
|
644
|
+
|
|
645
|
+
/**
|
|
646
|
+
* Penn Treebank tokenization
|
|
647
|
+
* @param {string} text - Input text
|
|
648
|
+
* @param {boolean} returnStr - Whether to return a string or array
|
|
649
|
+
* @returns {string|Array} - Tokenized text
|
|
650
|
+
*/
|
|
651
|
+
pennTokenize (text, returnStr = false) {
|
|
652
|
+
// Converts input string into unicode
|
|
653
|
+
text = String(text)
|
|
654
|
+
|
|
655
|
+
// Perform a chain of regex substitutions using MOSES_PENN_REGEXES_1
|
|
656
|
+
for (const [regexp, substitution] of this.MOSES_PENN_REGEXES_1) {
|
|
657
|
+
text = text.replace(regexp, substitution)
|
|
658
|
+
}
|
|
659
|
+
|
|
660
|
+
// Handles nonbreaking prefixes
|
|
661
|
+
text = this.handlesNonbreakingPrefixes(text)
|
|
662
|
+
|
|
663
|
+
// Restore ellipsis, clean extra spaces, escape XML symbols
|
|
664
|
+
for (const [regexp, substitution] of this.MOSES_PENN_REGEXES_2) {
|
|
665
|
+
text = text.replace(regexp, substitution)
|
|
666
|
+
}
|
|
667
|
+
|
|
668
|
+
return returnStr ? text : text.split(/\s+/).filter((t) => t.length > 0)
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
/**
|
|
672
|
+
* Main tokenization method
|
|
673
|
+
* @param {string} text - Input text
|
|
674
|
+
* @param {boolean} aggressiveDashSplits - Whether to aggressively split dashes
|
|
675
|
+
* @param {boolean} returnStr - Whether to return a string or array
|
|
676
|
+
* @param {boolean} escape - Whether to escape XML
|
|
677
|
+
* @param {Array} protectedPatterns - Patterns to protect from tokenization
|
|
678
|
+
* @returns {string|Array} - Tokenized text
|
|
679
|
+
*/
|
|
680
|
+
tokenize (
|
|
681
|
+
text,
|
|
682
|
+
aggressiveDashSplits = false,
|
|
683
|
+
returnStr = false,
|
|
684
|
+
escape = true,
|
|
685
|
+
protectedPatterns = null
|
|
686
|
+
) {
|
|
687
|
+
// Converts input string into unicode
|
|
688
|
+
text = String(text)
|
|
689
|
+
|
|
690
|
+
// De-duplicate spaces and clean ASCII junk
|
|
691
|
+
for (const [regexp, substitution] of [
|
|
692
|
+
this.DEDUPLICATE_SPACE,
|
|
693
|
+
this.ASCII_JUNK
|
|
694
|
+
]) {
|
|
695
|
+
text = text.replace(regexp, substitution)
|
|
696
|
+
}
|
|
697
|
+
|
|
698
|
+
// Initialize protectedTokens array HERE (properly scoped)
|
|
699
|
+
const protectedTokens = []
|
|
700
|
+
|
|
701
|
+
// Process protected patterns
|
|
702
|
+
if (protectedPatterns) {
|
|
703
|
+
try {
|
|
704
|
+
// Compile all patterns with global and case insensitivity flags
|
|
705
|
+
const compiledPatterns = protectedPatterns.map((p) =>
|
|
706
|
+
p instanceof RegExp
|
|
707
|
+
? new RegExp(
|
|
708
|
+
p.source,
|
|
709
|
+
p.flags.includes('g') ? p.flags : p.flags + 'g'
|
|
710
|
+
)
|
|
711
|
+
: new RegExp(p, 'gi')
|
|
712
|
+
)
|
|
713
|
+
|
|
714
|
+
// Find all matches across all patterns
|
|
715
|
+
compiledPatterns.forEach((pattern) => {
|
|
716
|
+
// Reset lastIndex to start from beginning
|
|
717
|
+
pattern.lastIndex = 0
|
|
718
|
+
|
|
719
|
+
// Find all matches for this pattern
|
|
720
|
+
let match
|
|
721
|
+
while ((match = pattern.exec(text)) !== null) {
|
|
722
|
+
if (match[0].length > 0) {
|
|
723
|
+
// Skip empty matches
|
|
724
|
+
protectedTokens.push(match[0])
|
|
725
|
+
}
|
|
726
|
+
|
|
727
|
+
// Avoid infinite loops for zero-width matches
|
|
728
|
+
if (match.index === pattern.lastIndex) {
|
|
729
|
+
pattern.lastIndex++
|
|
730
|
+
}
|
|
731
|
+
}
|
|
732
|
+
})
|
|
733
|
+
|
|
734
|
+
// Ensure we don't exceed 1000 matches (3-digit limit)
|
|
735
|
+
if (protectedTokens.length > 1000) {
|
|
736
|
+
console.warn(
|
|
737
|
+
`More than 1000 protected tokens found (${protectedTokens.length}). Using only the first 1000.`
|
|
738
|
+
)
|
|
739
|
+
protectedTokens.length = 1000 // Truncate to 1000
|
|
740
|
+
}
|
|
741
|
+
|
|
742
|
+
// Sort by length (longest first) to prevent substring replacements
|
|
743
|
+
const sortedTokenWithIndices = [...protectedTokens].map((token, i) => ({
|
|
744
|
+
token,
|
|
745
|
+
index: i
|
|
746
|
+
}))
|
|
747
|
+
sortedTokenWithIndices.sort((a, b) => b.token.length - a.token.length)
|
|
748
|
+
|
|
749
|
+
// Apply replacements from longest to shortest
|
|
750
|
+
for (const { token, index } of sortedTokenWithIndices) {
|
|
751
|
+
const substitution =
|
|
752
|
+
'THISISPROTECTED' + String(index).padStart(3, '0')
|
|
753
|
+
|
|
754
|
+
// Use split and join to replace all occurrences
|
|
755
|
+
text = text.split(token).join(substitution)
|
|
756
|
+
}
|
|
757
|
+
} catch (e) {
|
|
758
|
+
console.error('Error processing protected patterns:', e)
|
|
759
|
+
// Continue without protected pattern processing
|
|
760
|
+
}
|
|
761
|
+
}
|
|
762
|
+
|
|
763
|
+
// Strips heading and trailing spaces
|
|
764
|
+
text = text.trim()
|
|
765
|
+
|
|
766
|
+
// Separate special characters outside of IsAlnum character set
|
|
767
|
+
const [regexpNotAlnum, substitutionNotAlnum] = this.PAD_NOT_ISALNUM
|
|
768
|
+
text = text.replace(regexpNotAlnum, substitutionNotAlnum)
|
|
769
|
+
|
|
770
|
+
// Aggressively splits dashes
|
|
771
|
+
if (aggressiveDashSplits) {
|
|
772
|
+
const [regexpHyphen, substitutionHyphen] = this.AGGRESSIVE_HYPHEN_SPLIT
|
|
773
|
+
text = text.replace(regexpHyphen, substitutionHyphen)
|
|
774
|
+
}
|
|
775
|
+
|
|
776
|
+
// Replaces multidots with "DOTDOTMULTI" literal strings
|
|
777
|
+
text = this.replaceMultidots(text)
|
|
778
|
+
|
|
779
|
+
// Separate out "," except if within numbers e.g. 5,300
|
|
780
|
+
for (const [regexp, substitution] of [
|
|
781
|
+
this.COMMA_SEPARATE_1,
|
|
782
|
+
this.COMMA_SEPARATE_2,
|
|
783
|
+
this.COMMA_SEPARATE_3
|
|
784
|
+
]) {
|
|
785
|
+
text = text.replace(regexp, substitution)
|
|
786
|
+
}
|
|
787
|
+
|
|
788
|
+
// Language-specific apostrophe tokenization
|
|
789
|
+
if (this.lang === 'en') {
|
|
790
|
+
for (const [regexp, substitution] of this.ENGLISH_SPECIFIC_APOSTROPHE) {
|
|
791
|
+
text = text.replace(regexp, substitution)
|
|
792
|
+
}
|
|
793
|
+
} else if (this.lang === 'fr' || this.lang === 'it') {
|
|
794
|
+
for (const [regexp, substitution] of this.FR_IT_SPECIFIC_APOSTROPHE) {
|
|
795
|
+
text = text.replace(regexp, substitution)
|
|
796
|
+
}
|
|
797
|
+
} else {
|
|
798
|
+
const [regexp, substitution] = this.NON_SPECIFIC_APOSTROPHE
|
|
799
|
+
text = text.replace(regexp, substitution)
|
|
800
|
+
}
|
|
801
|
+
|
|
802
|
+
// Handles nonbreaking prefixes
|
|
803
|
+
text = this.handlesNonbreakingPrefixes(text)
|
|
804
|
+
|
|
805
|
+
// Cleans up extraneous spaces
|
|
806
|
+
const [regexpSpace, substitutionSpace] = this.DEDUPLICATE_SPACE
|
|
807
|
+
text = text.replace(regexpSpace, substitutionSpace).trim()
|
|
808
|
+
|
|
809
|
+
// Split trailing ".'".
|
|
810
|
+
const [regexpDotApostrophe, substitutionDotApostrophe] =
|
|
811
|
+
this.TRAILING_DOT_APOSTROPHE
|
|
812
|
+
text = text.replace(regexpDotApostrophe, substitutionDotApostrophe)
|
|
813
|
+
|
|
814
|
+
// Restore the protected tokens
|
|
815
|
+
if (protectedPatterns && protectedTokens.length > 0) {
|
|
816
|
+
// Process from 0 to length (the indices are embedded in the substitution strings)
|
|
817
|
+
for (let i = 0; i < protectedTokens.length; i++) {
|
|
818
|
+
const substitution = 'THISISPROTECTED' + String(i).padStart(3, '0')
|
|
819
|
+
const token = protectedTokens[i]
|
|
820
|
+
text = text.split(substitution).join(token)
|
|
821
|
+
}
|
|
822
|
+
}
|
|
823
|
+
|
|
824
|
+
// Restore multidots
|
|
825
|
+
text = this.restoreMultidots(text)
|
|
826
|
+
|
|
827
|
+
if (escape) {
|
|
828
|
+
// Escape XML symbols
|
|
829
|
+
text = this.escapeXml(text)
|
|
830
|
+
}
|
|
831
|
+
|
|
832
|
+
return returnStr ? text : text.split(/\s+/).filter((t) => t.length > 0)
|
|
833
|
+
}
|
|
834
|
+
}
|
|
835
|
+
|
|
836
|
+
/**
 * MosesDetokenizer class for detokenizing text in various languages.
 *
 * Joins a list of tokens back into a sentence, deciding for each token
 * whether it attaches to the previous token, to the next token, or stands
 * alone, based on punctuation, quote and language-specific rules.
 */
class MosesDetokenizer {
  /**
   * Initialize a new Moses Detokenizer
   * @param {string} lang - Language code (default: "en")
   */
  constructor (lang = 'en') {
    this.lang = lang

    this.perluniprops = new Perluniprops()

    // Character sets from Perluniprops, joined into strings for regex use
    this.IsAlnum = this._joinFromGenerator(this.perluniprops.chars('IsAlnum'))
    this.IsAlpha = this._joinFromGenerator(this.perluniprops.chars('IsAlpha'))
    this.IsSc = this._joinFromGenerator(this.perluniprops.chars('IsSc'))

    // Each rule below is a [pattern, replacement] pair
    this.AGGRESSIVE_HYPHEN_SPLIT = [/ @-@ /g, '-']

    // Merge multiple spaces
    this.ONE_SPACE = [/ {2,}/g, ' ']

    // Unescape special characters. The patterns match the escaped entity
    // text and the replacement is the literal character.
    this.UNESCAPE_FACTOR_SEPARATOR = [/&#124;/g, '|']
    this.UNESCAPE_LEFT_ANGLE_BRACKET = [/&lt;/g, '<']
    this.UNESCAPE_RIGHT_ANGLE_BRACKET = [/&gt;/g, '>']
    this.UNESCAPE_DOUBLE_QUOTE = [/&quot;/g, '"']
    this.UNESCAPE_SINGLE_QUOTE = [/&apos;/g, "'"]
    this.UNESCAPE_SYNTAX_NONTERMINAL_LEFT = [/&#91;/g, '[']
    this.UNESCAPE_SYNTAX_NONTERMINAL_RIGHT = [/&#93;/g, ']']
    this.UNESCAPE_AMPERSAND = [/&amp;/g, '&']

    // Legacy regexes for older Moses versions
    this.UNESCAPE_FACTOR_SEPARATOR_LEGACY = [/&bar;/g, '|']
    this.UNESCAPE_SYNTAX_NONTERMINAL_LEFT_LEGACY = [/&bra;/g, '[']
    this.UNESCAPE_SYNTAX_NONTERMINAL_RIGHT_LEGACY = [/&ket;/g, ']']

    // Group all XML unescape regexes. The ampersand rule stays last so that
    // an escaped entity such as "&amp;lt;" becomes "&lt;" and not "<".
    this.MOSES_UNESCAPE_XML_REGEXES = [
      this.UNESCAPE_FACTOR_SEPARATOR_LEGACY,
      this.UNESCAPE_FACTOR_SEPARATOR,
      this.UNESCAPE_LEFT_ANGLE_BRACKET,
      this.UNESCAPE_RIGHT_ANGLE_BRACKET,
      this.UNESCAPE_SYNTAX_NONTERMINAL_LEFT_LEGACY,
      this.UNESCAPE_SYNTAX_NONTERMINAL_RIGHT_LEGACY,
      this.UNESCAPE_DOUBLE_QUOTE,
      this.UNESCAPE_SINGLE_QUOTE,
      this.UNESCAPE_SYNTAX_NONTERMINAL_LEFT,
      this.UNESCAPE_SYNTAX_NONTERMINAL_RIGHT,
      this.UNESCAPE_AMPERSAND
    ]

    // Finnish morphological rules
    this.FINNISH_MORPHSET_1 = [
      'N',
      'n',
      'A',
      'a',
      'Ä',
      'ä',
      'ssa',
      'Ssa',
      'ssä',
      'Ssä',
      'sta',
      'stä',
      'Sta',
      'Stä',
      'hun',
      'Hun',
      'hyn',
      'Hyn',
      'han',
      'Han',
      'hän',
      'Hän',
      'hön',
      'Hön',
      'un',
      'Un',
      'yn',
      'Yn',
      'an',
      'An',
      'än',
      'Än',
      'ön',
      'Ön',
      'seen',
      'Seen',
      'lla',
      'Lla',
      'llä',
      'Llä',
      'lta',
      'Lta',
      'ltä',
      'Ltä',
      'lle',
      'Lle',
      'ksi',
      'Ksi',
      'kse',
      'Kse',
      'tta',
      'Tta',
      'ine',
      'Ine'
    ]

    this.FINNISH_MORPHSET_2 = ['ni', 'si', 'mme', 'nne', 'nsa']

    this.FINNISH_MORPHSET_3 = [
      'ko',
      'kö',
      'han',
      'hän',
      'pa',
      'pä',
      'kaan',
      'kään',
      'kin'
    ]

    // Combine Finnish morphsets into a regex pattern:
    // one item of set 1, optionally one of set 2, then one of set 3
    this.FINNISH_REGEX = new RegExp(
      `^(${this.FINNISH_MORPHSET_1.join('|')})(${this.FINNISH_MORPHSET_2.join(
        '|'
      )})?(${this.FINNISH_MORPHSET_3.join('|')})$`
    )

    // Token made only of currency symbols and opening brackets/marks
    this.IS_CURRENCY_SYMBOL = new RegExp(
      `^[${this._escapeRegExp(this.IsSc)}\\(\\[\\{\\¿\\¡]+$`
    )
    // Token starting with an apostrophe followed by a letter (e.g. 's)
    this.IS_ENGLISH_CONTRACTION = new RegExp(
      `^['][${this._escapeRegExp(this.IsAlpha)}]`
    )
    // Token ending with a letter followed by an apostrophe (e.g. l').
    // The property name keeps its historical spelling for compatibility.
    this.IS_FRENCH_CONRTACTION = new RegExp(
      `[${this._escapeRegExp(this.IsAlpha)}][']$`
    )
    this.STARTS_WITH_ALPHA = new RegExp(
      `^[${this._escapeRegExp(this.IsAlpha)}]`
    )
    // eslint-disable-next-line no-useless-escape
    this.IS_PUNCT = /^[\,\.\?\!\:\;\\\%\}\]\)]+$/
    // Straight quotes, low-9 quote, left curly double quote and backtick
    // eslint-disable-next-line no-useless-escape
    this.IS_OPEN_QUOTE = /^[\'\"\„\“\`]+$/
  }

  /**
   * Helper method to escape special characters in a string for regex.
   * '-' is escaped as well because the result is embedded in character
   * classes, where a bare hyphen would form a range.
   * @param {string} str - String to escape
   * @returns {string} - Escaped string
   * @private
   */
  _escapeRegExp (str) {
    return str.replace(/[.*+?^${}()|[\]\\-]/g, '\\$&')
  }

  /**
   * Helper method to concatenate everything an iterable yields into a string
   * @param {Generator} generator - Generator (or other iterable) of strings
   * @returns {string} - Resulting string
   * @private
   */
  _joinFromGenerator (generator) {
    let result = ''
    for (const char of generator) {
      result += char
    }
    return result
  }

  /**
   * Unescape XML-escaped characters in text by applying every rule of
   * MOSES_UNESCAPE_XML_REGEXES in list order
   * @param {string} text - Input text
   * @returns {string} - Processed text
   */
  unescapeXml (text) {
    for (const [regexp, substitution] of this.MOSES_UNESCAPE_XML_REGEXES) {
      text = text.replace(regexp, substitution)
    }
    return text
  }

  /**
   * Main detokenization method (named tokenize for compatibility with Python original)
   * @param {Array} tokens - Array of tokens to detokenize
   * @param {boolean} returnStr - Whether to return a string or array
   * @param {boolean} unescape - Whether to unescape XML
   * @returns {string|Array} - Detokenized text, or its non-empty
   *   whitespace-separated pieces when returnStr is false
   */
  tokenize (tokens, returnStr = true, unescape = true) {
    // Convert the list of tokens into a string and pad it with spaces
    let text = ` ${tokens.join(' ')} `

    // Detokenize the aggressive hyphen split
    const [regexpHyphen, substitutionHyphen] = this.AGGRESSIVE_HYPHEN_SPLIT
    text = text.replace(regexpHyphen, substitutionHyphen)

    if (unescape) {
      // Unescape the XML symbols
      text = this.unescapeXml(text)
    }

    // Keep track of quotation marks: an even count means the next quote
    // opens, an odd count means it closes
    const quoteCounts = { "'": 0, '"': 0, '``': 0, '`': 0, "''": 0 }

    // prependSpace is what gets inserted before the next token; setting it
    // to '' makes the next token attach to the current one
    let prependSpace = ' '
    let detokenizedText = ''

    // Split the text into non-empty tokens for processing
    const tokenArray = text.split(/\s+/).filter((t) => t.length > 0)

    // Iterate through every token and apply language specific detokenization rules
    for (let i = 0; i < tokenArray.length; i++) {
      const token = tokenArray[i]
      const previous = i > 0 ? tokenArray[i - 1] : ''

      if (isCJK(token[0]) && this.lang !== 'ko') {
        // First char is CJK
        if (i > 0 && isCJK(previous[previous.length - 1])) {
          // Left shift if this is a second consecutive CJK word
          detokenizedText += token
        } else {
          // Nothing special if this CJK word doesn't follow a CJK word
          detokenizedText += prependSpace + token
        }
        prependSpace = ' '
      } else if (this.IS_CURRENCY_SYMBOL.test(token)) {
        // Perform right shift on currency and other random punctuation items
        detokenizedText += prependSpace + token
        prependSpace = ''
      } else if (this.IS_PUNCT.test(token)) {
        // In French, these punctuations are prefixed with a space
        if (this.lang === 'fr' && /^[?!:;\\%]$/.test(token)) {
          detokenizedText += ' '
        }
        // Perform left shift on punctuation items
        detokenizedText += token
        prependSpace = ' '
      } else if (
        this.lang === 'en' &&
        i > 0 &&
        this.IS_ENGLISH_CONTRACTION.test(token)
      ) {
        // For English, left-shift the contraction
        detokenizedText += token
        prependSpace = ' '
      } else if (
        this.lang === 'cs' &&
        i > 1 &&
        /^[0-9]+$/.test(tokenArray[i - 2]) && // Previous previous token is a number
        /^[.,]$/.test(tokenArray[i - 1]) && // Previous token is a dot/comma
        /^[0-9]+$/.test(token) // Current token is a number
      ) {
        // In Czech, left-shift floats that are decimal numbers
        detokenizedText += token
        prependSpace = ' '
      } else if (
        ['fr', 'it', 'ga'].includes(this.lang) &&
        i < tokenArray.length - 1 &&
        this.IS_FRENCH_CONRTACTION.test(token) &&
        this.STARTS_WITH_ALPHA.test(tokenArray[i + 1])
      ) {
        // For French, Italian and Gaelic, right-shift the contraction
        detokenizedText += prependSpace + token
        prependSpace = ''
      } else if (
        this.lang === 'cs' &&
        i < tokenArray.length - 2 &&
        this.IS_FRENCH_CONRTACTION.test(token) &&
        /^[-–]$/.test(tokenArray[i + 1]) &&
        /^li$|^mail.*/i.test(tokenArray[i + 2])
      ) {
        // In Czech, right-shift "-li" and a few Czech dashed words (e.g. e-mail)
        detokenizedText += prependSpace + token + tokenArray[i + 1]
        i++ // Skip the dash token
        prependSpace = ''
      } else if (this.IS_OPEN_QUOTE.test(token)) {
        // Quote handling: all double-quote variants share one counter
        let normalizedQuo = token
        if (/^[„“”"]/.test(token)) {
          normalizedQuo = '"'
        }

        // Initialize quote count if not present
        quoteCounts[normalizedQuo] = quoteCounts[normalizedQuo] || 0

        // Czech uses „ to open and “ to close, whatever the running count is
        if (this.lang === 'cs' && token === '„') {
          quoteCounts[normalizedQuo] = 0
        }
        if (this.lang === 'cs' && token === '“') {
          quoteCounts[normalizedQuo] = 1
        }

        if (quoteCounts[normalizedQuo] % 2 === 0) {
          // Even count of quotes (opening quote)
          if (
            this.lang === 'en' &&
            token === "'" &&
            i > 0 &&
            /[s]$/.test(tokenArray[i - 1])
          ) {
            // Left shift on single quote for possessives ending in "s";
            // the count is deliberately not incremented here
            detokenizedText += token
            prependSpace = ' '
          } else {
            // Right shift for opening quotes
            detokenizedText += prependSpace + token
            prependSpace = ''
            quoteCounts[normalizedQuo]++
          }
        } else {
          // Left shift for closing quotes
          detokenizedText += token
          prependSpace = ' '
          quoteCounts[normalizedQuo]++
        }
      } else if (
        this.lang === 'fi' &&
        i > 0 &&
        /:$/.test(tokenArray[i - 1]) &&
        this.FINNISH_REGEX.test(token)
      ) {
        // Finnish case suffix following a token that ends with ':'.
        // NOTE(review): this appends exactly what the default branch appends
        // (prependSpace + token), although the rule is described as joining
        // the suffix without an intervening space - confirm the intent.
        detokenizedText += prependSpace + token
        prependSpace = ' '
      } else {
        // Default case - just add the token with appropriate spacing
        detokenizedText += prependSpace + token
        prependSpace = ' '
      }
    }

    // Merge multiple spaces
    const [regexpSpace, substitutionSpace] = this.ONE_SPACE
    detokenizedText = detokenizedText.replace(regexpSpace, substitutionSpace)

    // Remove heading and trailing spaces
    detokenizedText = detokenizedText.trim()

    if (returnStr) {
      return detokenizedText
    }
    // Drop empty pieces so that an empty result yields [] instead of ['']
    return detokenizedText.split(/\s+/).filter((t) => t.length > 0)
  }

  /**
   * Alias for tokenize to match the original Python API
   * @param {Array} tokens - Array of tokens to detokenize
   * @param {boolean} returnStr - Whether to return a string or array
   * @param {boolean} unescape - Whether to unescape XML
   * @returns {string|Array} - Detokenized text
   */
  detokenize (tokens, returnStr = true, unescape = true) {
    return this.tokenize(tokens, returnStr, unescape)
  }
}
|
|
1213
|
+
|
|
1214
|
+
// Public API of this module: the tokenizer and detokenizer classes.
module.exports = { MosesTokenizer, MosesDetokenizer }
|