@qvac/translation-nmtcpp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -0
- package/README.md +470 -0
- package/binding.js +1 -0
- package/index.d.ts +82 -0
- package/index.js +188 -0
- package/lib/error.js +65 -0
- package/marian.js +186 -0
- package/package.json +69 -0
- package/prebuilds/android-arm/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/android-arm64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/android-ia32/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/android-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare.exports +3622 -0
- package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare.exports +3731 -0
- package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare.exports +3603 -0
- package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare.exports +3603 -0
- package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare.exports +3720 -0
- package/prebuilds/linux-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare.exports +0 -0
- package/third-party/indic-processor-deps/indicnlp/INDIC_NLP_LICENCE +9 -0
- package/third-party/indic-processor-deps/indicnlp/index.js +11 -0
- package/third-party/indic-processor-deps/indicnlp/indic_detokenize.js +141 -0
- package/third-party/indic-processor-deps/indicnlp/indic_normalize.js +1213 -0
- package/third-party/indic-processor-deps/indicnlp/indic_tokenize.js +123 -0
- package/third-party/indic-processor-deps/indicnlp/langinfo.js +609 -0
- package/third-party/indic-processor-deps/indicnlp/sinhala_transliterator.js +197 -0
- package/third-party/indic-processor-deps/indicnlp/unicode_transliterator.js +120 -0
- package/third-party/indic-processor-deps/sacremoses/SACREMOSES_LICENCE +21 -0
- package/third-party/indic-processor-deps/sacremoses/cjk.js +202 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/README.txt +8 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.as +65 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.bn +65 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ca +75 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.cs +390 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.de +325 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.el +1568 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.en +123 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.es +118 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.et +138 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fi +138 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fr +153 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ga +48 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.gu +105 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hi +113 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hu +103 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.is +251 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.it +180 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.kn +70 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lt +698 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lv +100 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ml +67 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mni +65 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mr +113 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.nl +115 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.or +101 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pa +102 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pl +283 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pt +210 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ro +38 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ru +293 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sk +474 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sl +78 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sv +97 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ta +71 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.tdt +210 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.te +70 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.yue +53 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.zh +53 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJK.txt +23246 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJKSymbols.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Close_Punctuation.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Currency_Symbol.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Han.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul_Syllables.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hiragana.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum-unichars-au.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha-unichars-au.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsLower.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsN.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPf.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPi.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSc.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSo.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsUpper.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Katakana.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Line_Separator.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Lowercase_Letter.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Number.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Open_Punctuation.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Punctuation.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Separator.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Symbol.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Titlecase_Letter.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Uppercase_Letter.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/index.js +8 -0
- package/third-party/indic-processor-deps/sacremoses/indic.js +76 -0
- package/third-party/indic-processor-deps/sacremoses/normalizer.js +264 -0
- package/third-party/indic-processor-deps/sacremoses/pernuliprops.js +287 -0
- package/third-party/indic-processor-deps/sacremoses/tokenizer.js +1217 -0
- package/third-party/indic-processor.js +565 -0
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JavaScript port of the Moses punctuation normalizer from
|
|
3
|
+
* https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
class MosesPunctNormalizer {
  /**
   * Build the ordered list of `[RegExp, replacement]` substitutions used by
   * `normalize()`.
   *
   * Every non-ASCII character in the patterns is written as a `\uXXXX` escape
   * so the tables stay correct even if the source file is re-encoded or
   * compatibility-normalized (which would otherwise turn e.g. the full-width
   * question mark into a bare `?` and produce an invalid regular expression).
   *
   * @param {string} lang - The two-letter language code (default: "en")
   * @param {Object} options - Configuration options
   * @param {boolean} options.penn - Normalize Penn Treebank style quotations (default: true)
   * @param {boolean} options.normQuoteCommas - Normalize quotations and commas (default: true)
   * @param {boolean} options.normNumbers - Normalize numbers (default: true)
   * @param {boolean} options.preReplaceUnicodePunct - Replace Unicode punctuation before normalization (default: false)
   * @param {boolean} options.postRemoveControlChars - Remove control characters after normalization (default: false)
   * @param {boolean} options.perlParity - Exact parity with Perl script (default: false)
   */
  constructor (lang = 'en', options = {}) {
    // Destructuring defaults (rather than an object spread) so that an option
    // passed explicitly as `undefined` still gets its documented default.
    const {
      penn = true,
      normQuoteCommas = true,
      normNumbers = true,
      preReplaceUnicodePunct = false,
      postRemoveControlChars = false,
      perlParity = false
    } = options ?? {}

    // Extra whitespace patterns
    this.EXTRA_WHITESPACE = [
      [/\r/g, ''],
      [/\(/g, ' ('],
      [/\)/g, ') '],
      [/ +/g, ' '],
      [/\) ([.!:?;,])/g, ')$1'],
      [/\( /g, '('],
      [/ \)/g, ')'],
      [/(\d) %/g, '$1%'],
      [/ :/g, ':'],
      [/ ;/g, ';']
    ]

    // Backtick / doubled-apostrophe handling; only applied when `penn` is set
    // (the name is inherited from the upstream implementation).
    this.NORMALIZE_UNICODE_IF_NOT_PENN = [
      [/`/g, "'"],
      [/''/g, ' " ']
    ]

    // Unicode quotes, dashes and ellipsis -> ASCII.
    // The array order is significant: `perlParity` overrides index 11.
    this.NORMALIZE_UNICODE = [
      [/\u201E/g, '"'], // double low-9 quotation mark
      [/\u201C/g, '"'], // left double quotation mark
      [/\u201D/g, '"'], // right double quotation mark
      [/\u2013/g, '-'], // en dash
      [/\u2014/g, ' - '], // em dash
      [/ +/g, ' '],
      [/\u00B4/g, "'"], // acute accent
      [/([a-zA-Z])\u2018([a-zA-Z])/g, "$1'$2"], // left single quote between letters
      [/([a-zA-Z])\u2019([a-zA-Z])/g, "$1'$2"], // right single quote between letters
      [/\u2018/g, "'"], // left single quotation mark
      [/\u201A/g, "'"], // single low-9 quotation mark
      [/\u2019/g, "'"], // right single quotation mark (index 11)
      [/''/g, '"'],
      [/\u00B4\u00B4/g, '"'], // doubled acute accent
      [/\u2026/g, '...'] // horizontal ellipsis
    ]

    // French guillemets, with or without surrounding no-break spaces.
    // The array order is significant: `perlParity` overrides indices 0 and 3.
    this.FRENCH_QUOTES = [
      [/\u00A0\u00AB\u00A0/g, '"'],
      [/\u00AB\u00A0/g, '"'],
      [/\u00AB/g, '"'],
      [/\u00A0\u00BB\u00A0/g, '"'],
      [/\u00A0\u00BB/g, '"'],
      [/\u00BB/g, '"']
    ]

    // No-break spaces (U+00A0) around punctuation and units
    this.HANDLE_PSEUDO_SPACES = [
      [/\u00A0%/g, '%'],
      [/n\u00BA\u00A0/g, 'n\u00BA '],
      [/\u00A0:/g, ':'],
      [/\u00A0\u00BAC/g, ' \u00BAC'],
      [/\u00A0cm/g, ' cm'],
      [/\u00A0\?/g, '?'],
      [/\u00A0!/g, '!'],
      [/\u00A0;/g, ';'],
      [/,\u00A0/g, ', '],
      [/ +/g, ' ']
    ]

    // English: move commas/periods that follow a closing quote inside it
    this.EN_QUOTATION_FOLLOWED_BY_COMMA = [[/"([,.]+)/g, '$1"']]

    // German, Spanish, French: move commas/periods outside the quote
    this.DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA = [
      [/,"/g, '",'],
      [/(\.+)"(\s*[^<])/g, '"$1$2'] // don't fix period at end of sentence
    ]

    // German, Spanish, Czech, French: no-break space between digits -> comma
    this.DE_ES_CZ_CS_FR = [[/(\d)\u00A0(\d)/g, '$1,$2']]

    // Every other language: no-break space between digits -> period
    this.OTHER = [[/(\d)\u00A0(\d)/g, '$1.$2']]

    // Full-width / CJK punctuation and digits -> ASCII
    this.REPLACE_UNICODE_PUNCTUATION = [
      [/\uFF0C/g, ','], // full-width comma
      [/\u3002\s*/g, '. '], // ideographic full stop
      [/\u3001/g, ','], // ideographic comma
      [/\u201D/g, '"'], // right double quotation mark
      [/\u201C/g, '"'], // left double quotation mark
      [/\u2236/g, ':'], // ratio
      [/\uFF1A/g, ':'], // full-width colon
      [/\uFF1F/g, '?'], // full-width question mark
      [/\u300A/g, '"'], // left double angle bracket
      [/\u300B/g, '"'], // right double angle bracket
      [/\uFF09/g, ')'], // full-width right parenthesis
      [/\uFF01/g, '!'], // full-width exclamation mark
      [/\uFF08/g, '('], // full-width left parenthesis
      [/\uFF1B/g, ';'], // full-width semicolon
      [/\u300D/g, '"'], // right corner bracket
      [/\u300C/g, '"'], // left corner bracket
      [/\uFF10/g, '0'], // full-width digits 0-9
      [/\uFF11/g, '1'],
      [/\uFF12/g, '2'],
      [/\uFF13/g, '3'],
      [/\uFF14/g, '4'],
      [/\uFF15/g, '5'],
      [/\uFF16/g, '6'],
      [/\uFF17/g, '7'],
      [/\uFF18/g, '8'],
      [/\uFF19/g, '9'],
      [/\uFF0E\s*/g, '. '], // full-width full stop
      [/\uFF5E/g, '~'], // full-width tilde
      [/\u2019/g, "'"], // right single quotation mark
      [/\u2026/g, '...'], // horizontal ellipsis
      [/\u2501/g, '-'], // box drawings heavy horizontal
      [/\u3008/g, '<'], // left angle bracket
      [/\u3009/g, '>'], // right angle bracket
      [/\u3010/g, '['], // left black lenticular bracket
      [/\u3011/g, ']'], // right black lenticular bracket
      [/\uFF05/g, '%'] // full-width percent sign
    ]

    // Modify patterns if perl parity is requested
    if (perlParity) {
      this.NORMALIZE_UNICODE[11] = [/\u2019/g, '"'] // right single quote becomes a double quote
      this.FRENCH_QUOTES[0] = [/\u00A0\u00AB\u00A0/g, ' "']
      this.FRENCH_QUOTES[3] = [/\u00A0\u00BB\u00A0/g, '" ']
    }

    // Assemble the substitutions in application order
    this.substitutions = [
      ...this.EXTRA_WHITESPACE,
      ...(penn ? this.NORMALIZE_UNICODE_IF_NOT_PENN : []),
      ...this.NORMALIZE_UNICODE,
      ...this.FRENCH_QUOTES,
      ...this.HANDLE_PSEUDO_SPACES
    ]

    // Quotation/comma ordering is language specific; other languages get none
    if (normQuoteCommas) {
      if (lang === 'en') {
        this.substitutions.push(...this.EN_QUOTATION_FOLLOWED_BY_COMMA)
      } else if (['de', 'es', 'fr'].includes(lang)) {
        this.substitutions.push(...this.DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA)
      }
    }

    // Number normalization: comma for de/es/cz/cs/fr, period otherwise
    if (normNumbers) {
      if (['de', 'es', 'cz', 'cs', 'fr'].includes(lang)) {
        this.substitutions.push(...this.DE_ES_CZ_CS_FR)
      } else {
        this.substitutions.push(...this.OTHER)
      }
    }

    this.preReplaceUnicodePunct = preReplaceUnicodePunct
    this.postRemoveControlChars = postRemoveControlChars
  }

  /**
   * Normalize punctuation in text.
   *
   * Applies, in order: the optional Unicode punctuation replacement, every
   * entry of `this.substitutions`, the optional control character removal,
   * and finally trims the result.
   *
   * @param {string} text - The text to normalize
   * @returns {string} - The normalized text
   */
  normalize (text) {
    // Optionally, replace unicode puncts BEFORE normalization
    if (this.preReplaceUnicodePunct) {
      text = this.replaceUnicodePunct(text)
    }

    // Actual normalization
    for (const [regexp, substitution] of this.substitutions) {
      text = text.replace(regexp, substitution)
    }

    // Optionally, remove control characters AFTER normalization
    if (this.postRemoveControlChars) {
      text = this.removeControlChars(text)
    }

    return text.trim()
  }

  /**
   * Replace Unicode punctuation with ASCII equivalents
   *
   * @param {string} text - The text to process
   * @returns {string} - The processed text
   */
  replaceUnicodePunct (text) {
    for (const [regexp, substitution] of this.REPLACE_UNICODE_PUNCTUATION) {
      text = text.replace(regexp, substitution)
    }
    return text
  }

  /**
   * Remove control characters from text
   *
   * Only the C0 range U+0000-U+001F is stripped, which includes tab and
   * newline.
   * NOTE(review): JavaScript does support `\p{C}` through the `u` flag, but
   * that class also covers format characters such as ZWJ/ZWNJ; confirm whether
   * the narrower range used here is intentional before widening it.
   *
   * @param {string} text - The text to process
   * @returns {string} - The processed text
   */
  removeControlChars (text) {
    // eslint-disable-next-line no-control-regex
    return text.replace(/[\x00-\x1F]/g, '')
  }
}
|
|
263
|
+
|
|
264
|
+
// CommonJS export: the normalizer class is exposed as a named property.
module.exports = { MosesPunctNormalizer }
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JavaScript port of the Perluniprops class from sacremoses
|
|
3
|
+
* This class is used to read lists of characters from the Perl Unicode Properties
|
|
4
|
+
* (see http://perldoc.perl.org/perluniprops.html).
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
const fs = require('bare-fs')
|
|
8
|
+
|
|
9
|
+
// Lookup table from Perl Unicode property name to the value returned by
// `require.asset` for the matching file under ./data/perluniprops.
// Perluniprops._loadCategory passes that value to fs.readFileSync, so it is
// used as a file path.
// NOTE(review): each entry is a literal `require.asset('...')` call,
// presumably so the bundler can detect the assets statically - confirm before
// generating this table dynamically.
const pernuniPropsAssets = {
  CJK: require.asset('./data/perluniprops/CJK.txt'),
  CJKSymbols: require.asset('./data/perluniprops/CJKSymbols.txt'),
  Close_Punctuation: require.asset('./data/perluniprops/Close_Punctuation.txt'),
  Currency_Symbol: require.asset('./data/perluniprops/Currency_Symbol.txt'),
  Han: require.asset('./data/perluniprops/Han.txt'),
  Hangul: require.asset('./data/perluniprops/Hangul.txt'),
  Hangul_Syllables: require.asset('./data/perluniprops/Hangul_Syllables.txt'),
  Hiragana: require.asset('./data/perluniprops/Hiragana.txt'),
  IsAlnum: require.asset('./data/perluniprops/IsAlnum.txt'),
  // Keys containing '-' are not valid identifiers and therefore stay quoted
  'IsAlnum-unichars-au': require.asset('./data/perluniprops/IsAlnum-unichars-au.txt'),
  IsAlpha: require.asset('./data/perluniprops/IsAlpha.txt'),
  'IsAlpha-unichars-au': require.asset('./data/perluniprops/IsAlpha-unichars-au.txt'),
  IsLower: require.asset('./data/perluniprops/IsLower.txt'),
  IsN: require.asset('./data/perluniprops/IsN.txt'),
  IsPf: require.asset('./data/perluniprops/IsPf.txt'),
  IsPi: require.asset('./data/perluniprops/IsPi.txt'),
  IsSc: require.asset('./data/perluniprops/IsSc.txt'),
  IsSo: require.asset('./data/perluniprops/IsSo.txt'),
  IsUpper: require.asset('./data/perluniprops/IsUpper.txt'),
  Katakana: require.asset('./data/perluniprops/Katakana.txt'),
  Line_Separator: require.asset('./data/perluniprops/Line_Separator.txt'),
  Lowercase_Letter: require.asset('./data/perluniprops/Lowercase_Letter.txt'),
  Number: require.asset('./data/perluniprops/Number.txt'),
  Open_Punctuation: require.asset('./data/perluniprops/Open_Punctuation.txt'),
  Punctuation: require.asset('./data/perluniprops/Punctuation.txt'),
  Separator: require.asset('./data/perluniprops/Separator.txt'),
  Symbol: require.asset('./data/perluniprops/Symbol.txt'),
  Titlecase_Letter: require.asset('./data/perluniprops/Titlecase_Letter.txt'),
  Uppercase_Letter: require.asset('./data/perluniprops/Uppercase_Letter.txt')
}
|
|
40
|
+
|
|
41
|
+
// Lookup table from `nonbreaking_prefix.<language code>` to the value returned
// by `require.asset` for the matching file under ./data/nonbreaking_prefixes.
// NonbreakingPrefixes._loadFile looks entries up by that filename and passes
// the value to fs.readFileSync, so it is used as a file path.
// NOTE(review): each entry is a literal `require.asset('...')` call,
// presumably so the bundler can detect the assets statically - confirm before
// generating this table dynamically.
const nonBreakingPrefixAssets = {
  'nonbreaking_prefix.as': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.as'),
  'nonbreaking_prefix.bn': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.bn'),
  'nonbreaking_prefix.ca': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.ca'),
  'nonbreaking_prefix.cs': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.cs'),
  'nonbreaking_prefix.de': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.de'),
  'nonbreaking_prefix.el': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.el'),
  'nonbreaking_prefix.en': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.en'),
  'nonbreaking_prefix.es': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.es'),
  'nonbreaking_prefix.et': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.et'),
  'nonbreaking_prefix.fi': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.fi'),
  'nonbreaking_prefix.fr': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.fr'),
  'nonbreaking_prefix.ga': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.ga'),
  'nonbreaking_prefix.gu': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.gu'),
  'nonbreaking_prefix.hi': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.hi'),
  'nonbreaking_prefix.hu': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.hu'),
  'nonbreaking_prefix.is': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.is'),
  'nonbreaking_prefix.it': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.it'),
  'nonbreaking_prefix.kn': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.kn'),
  'nonbreaking_prefix.lt': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.lt'),
  'nonbreaking_prefix.lv': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.lv'),
  'nonbreaking_prefix.ml': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.ml'),
  'nonbreaking_prefix.mni': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.mni'),
  'nonbreaking_prefix.mr': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.mr'),
  'nonbreaking_prefix.nl': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.nl'),
  'nonbreaking_prefix.or': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.or'),
  'nonbreaking_prefix.pa': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.pa'),
  'nonbreaking_prefix.pl': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.pl'),
  'nonbreaking_prefix.pt': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.pt'),
  'nonbreaking_prefix.ro': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.ro'),
  'nonbreaking_prefix.ru': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.ru'),
  'nonbreaking_prefix.sk': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.sk'),
  'nonbreaking_prefix.sl': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.sl'),
  'nonbreaking_prefix.sv': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.sv'),
  'nonbreaking_prefix.ta': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.ta'),
  'nonbreaking_prefix.tdt': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.tdt'),
  'nonbreaking_prefix.te': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.te'),
  'nonbreaking_prefix.yue': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.yue'),
  'nonbreaking_prefix.zh': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.zh')
}
|
|
81
|
+
|
|
82
|
+
class Perluniprops {
  /**
   * Initialize the Perluniprops class
   */
  constructor () {
    // Cache of category name -> loaded characters. A prototype-less object is
    // used so that names such as "constructor" or "__proto__" behave like any
    // other key.
    this._cache = Object.create(null)
  }

  /**
   * Load a character set from its bundled asset file
   * @param {string} category - The Unicode character category to load
   * @returns {string} - A string containing all characters in the category
   * @throws {Error} - When no asset is registered for the category, or when
   *   reading the file fails
   * @private
   */
  _loadCategory (category) {
    // Own-property check: inherited names such as "toString" are not assets
    const isRegistered = Object.prototype.hasOwnProperty.call(pernuniPropsAssets, category)
    const filePath = isRegistered ? pernuniPropsAssets[category] : undefined

    if (!filePath) {
      throw new Error(`Category file not found: ${category}`)
    }

    // Read the file content and decode as UTF-8
    const content = fs.readFileSync(filePath, { encoding: 'utf8' })

    // Always hand back a string, even if the reader returned something else
    return typeof content === 'string' ? content : ''
  }

  /**
   * Get characters from a specific Unicode category
   *
   * The category is loaded at most once: a failed load is logged, cached as an
   * empty set and not retried on later calls.
   *
   * @param {string} category - The Unicode character category
   * @returns {Generator} - A generator yielding characters from the category
   */
  * chars (category) {
    // Test for presence, not truthiness, so a cached '' is not reloaded
    if (!(category in this._cache)) {
      let loadedData = ''
      try {
        loadedData = this._loadCategory(category) || ''
      } catch (error) {
        console.error(`Error loading category ${category}: ${error.message}`)
      }
      this._cache[category] = loadedData
    }

    // Guard against a non-iterable value having been placed in the cache
    const cachedData = this._cache[category]
    const isIterable =
      typeof cachedData === 'string' ||
      typeof cachedData?.[Symbol.iterator] === 'function'
    if (!isIterable) {
      this._cache[category] = ''
      return
    }

    // Yield each character (strings iterate by code point)
    yield * cachedData
  }
}
|
|
145
|
+
|
|
146
|
+
class NonbreakingPrefixes {
  /**
   * Initialize a new NonbreakingPrefixes instance
   */
  constructor () {
    // Map of language names to language codes
    this.available_langs = {
      assamese: 'as',
      bengali: 'bn',
      catalan: 'ca',
      czech: 'cs',
      german: 'de',
      greek: 'el',
      english: 'en',
      spanish: 'es',
      estonian: 'et',
      finnish: 'fi',
      french: 'fr',
      irish: 'ga',
      gujarati: 'gu',
      hindi: 'hi',
      hungarian: 'hu',
      icelandic: 'is',
      italian: 'it',
      kannada: 'kn',
      lithuanian: 'lt',
      latvian: 'lv',
      malayalam: 'ml',
      manipuri: 'mni',
      marathi: 'mr',
      dutch: 'nl',
      oriya: 'or',
      punjabi: 'pa',
      polish: 'pl',
      portuguese: 'pt',
      romanian: 'ro',
      russian: 'ru',
      slovak: 'sk',
      slovenian: 'sl',
      swedish: 'sv',
      tamil: 'ta',
      telugu: 'te',
      tetum: 'tdt',
      cantonese: 'yue',
      chinese: 'zh'
    }

    // Also accept the language codes themselves as keys
    for (const code of Object.values(this.available_langs)) {
      this.available_langs[code] = code
    }

    // Cache of loaded prefixes, keyed by filename AND ignore-prefix (the same
    // file filtered with a different prefix gives a different list)
    this._cache = {}
  }

  /**
   * Load nonbreaking prefixes from a bundled asset file
   *
   * Lines are trimmed; empty lines and lines starting with
   * `ignoreLineStartswith` are dropped. A missing or unreadable file is
   * reported on the console and produces an empty list.
   *
   * @param {string} filename - The filename to load
   * @param {string} ignoreLineStartswith - Lines to ignore in file
   * @returns {Array<string>} - An array of nonbreaking prefixes
   * @private
   */
  _loadFile (filename, ignoreLineStartswith = '#') {
    // Own-property check: inherited names such as "toString" are not assets
    const isRegistered = Object.prototype.hasOwnProperty.call(nonBreakingPrefixAssets, filename)
    const filePath = isRegistered ? nonBreakingPrefixAssets[filename] : undefined

    if (!filePath) {
      console.warn(`Nonbreaking prefixes file not found: ${filename}`)
      return []
    }

    try {
      const content = fs.readFileSync(filePath, { encoding: 'utf8' })

      return content
        .split('\n')
        .map((line) => line.trim())
        .filter((line) => line && !line.startsWith(ignoreLineStartswith))
    } catch (error) {
      console.error(`Error reading file ${filePath}: ${error.message}`)
      return []
    }
  }

  /**
   * Generator function that yields nonbreaking prefixes for the specified language(s)
   *
   * A known language name or code selects that language, `null` selects every
   * language, and anything else falls back to English.
   *
   * @param {string|null} lang - Language code (default: null for all languages)
   * @param {string} ignoreLineStartswith - Lines to ignore in file (default: "#")
   * @yields {string} - Nonbreaking prefixes
   */
  * words (lang = null, ignoreLineStartswith = '#') {
    // Own-property check so inherited names ("constructor", "toString", ...)
    // are not mistaken for supported languages
    const isKnownLang =
      Boolean(lang) && Object.prototype.hasOwnProperty.call(this.available_langs, lang)

    let codes
    if (isKnownLang) {
      codes = [this.available_langs[lang]]
    } else if (lang === null) {
      // Names and codes map to the same values, so de-duplicate
      codes = [...new Set(Object.values(this.available_langs))]
    } else {
      // Default to English if language not available
      codes = ['en']
    }

    for (const code of codes) {
      const filename = `nonbreaking_prefix.${code}`
      const cacheKey = `${filename}::${ignoreLineStartswith}`

      if (!Object.prototype.hasOwnProperty.call(this._cache, cacheKey)) {
        this._cache[cacheKey] = this._loadFile(filename, ignoreLineStartswith)
      }

      yield * this._cache[cacheKey]
    }
  }

  /**
   * Get all nonbreaking prefixes for the specified language(s) as an array
   * @param {string|null} lang - Language code
   * @param {string} ignoreLineStartswith - Lines to ignore in file
   * @returns {Array<string>} - An array of nonbreaking prefixes
   */
  getWordsAsArray (lang = null, ignoreLineStartswith = '#') {
    return [...this.words(lang, ignoreLineStartswith)]
  }
}
|
|
282
|
+
|
|
283
|
+
// Export both classes defined in this module: the Unicode property reader and
// the nonbreaking-prefix reader.
module.exports = {
  Perluniprops,
  NonbreakingPrefixes
}
|