@qvac/translation-nmtcpp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. package/LICENSE +202 -0
  2. package/README.md +470 -0
  3. package/binding.js +1 -0
  4. package/index.d.ts +82 -0
  5. package/index.js +188 -0
  6. package/lib/error.js +65 -0
  7. package/marian.js +186 -0
  8. package/package.json +69 -0
  9. package/prebuilds/android-arm/qvac__translation-nmtcpp.bare +0 -0
  10. package/prebuilds/android-arm64/qvac__translation-nmtcpp.bare +0 -0
  11. package/prebuilds/android-ia32/qvac__translation-nmtcpp.bare +0 -0
  12. package/prebuilds/android-x64/qvac__translation-nmtcpp.bare +0 -0
  13. package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare +0 -0
  14. package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare.exports +3622 -0
  15. package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare +0 -0
  16. package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare.exports +3731 -0
  17. package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare +0 -0
  18. package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare.exports +3603 -0
  19. package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare +0 -0
  20. package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare.exports +3603 -0
  21. package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare +0 -0
  22. package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare.exports +3720 -0
  23. package/prebuilds/linux-x64/qvac__translation-nmtcpp.bare +0 -0
  24. package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare +0 -0
  25. package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare.exports +0 -0
  26. package/third-party/indic-processor-deps/indicnlp/INDIC_NLP_LICENCE +9 -0
  27. package/third-party/indic-processor-deps/indicnlp/index.js +11 -0
  28. package/third-party/indic-processor-deps/indicnlp/indic_detokenize.js +141 -0
  29. package/third-party/indic-processor-deps/indicnlp/indic_normalize.js +1213 -0
  30. package/third-party/indic-processor-deps/indicnlp/indic_tokenize.js +123 -0
  31. package/third-party/indic-processor-deps/indicnlp/langinfo.js +609 -0
  32. package/third-party/indic-processor-deps/indicnlp/sinhala_transliterator.js +197 -0
  33. package/third-party/indic-processor-deps/indicnlp/unicode_transliterator.js +120 -0
  34. package/third-party/indic-processor-deps/sacremoses/SACREMOSES_LICENCE +21 -0
  35. package/third-party/indic-processor-deps/sacremoses/cjk.js +202 -0
  36. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/README.txt +8 -0
  37. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.as +65 -0
  38. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.bn +65 -0
  39. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ca +75 -0
  40. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.cs +390 -0
  41. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.de +325 -0
  42. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.el +1568 -0
  43. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.en +123 -0
  44. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.es +118 -0
  45. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.et +138 -0
  46. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fi +138 -0
  47. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fr +153 -0
  48. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ga +48 -0
  49. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.gu +105 -0
  50. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hi +113 -0
  51. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hu +103 -0
  52. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.is +251 -0
  53. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.it +180 -0
  54. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.kn +70 -0
  55. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lt +698 -0
  56. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lv +100 -0
  57. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ml +67 -0
  58. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mni +65 -0
  59. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mr +113 -0
  60. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.nl +115 -0
  61. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.or +101 -0
  62. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pa +102 -0
  63. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pl +283 -0
  64. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pt +210 -0
  65. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ro +38 -0
  66. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ru +293 -0
  67. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sk +474 -0
  68. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sl +78 -0
  69. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sv +97 -0
  70. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ta +71 -0
  71. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.tdt +210 -0
  72. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.te +70 -0
  73. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.yue +53 -0
  74. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.zh +53 -0
  75. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJK.txt +23246 -0
  76. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJKSymbols.txt +1 -0
  77. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Close_Punctuation.txt +1 -0
  78. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Currency_Symbol.txt +1 -0
  79. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Han.txt +1 -0
  80. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul.txt +1 -0
  81. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul_Syllables.txt +1 -0
  82. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hiragana.txt +1 -0
  83. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum-unichars-au.txt +1 -0
  84. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum.txt +1 -0
  85. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha-unichars-au.txt +1 -0
  86. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha.txt +1 -0
  87. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsLower.txt +1 -0
  88. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsN.txt +1 -0
  89. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPf.txt +1 -0
  90. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPi.txt +1 -0
  91. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSc.txt +1 -0
  92. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSo.txt +1 -0
  93. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsUpper.txt +1 -0
  94. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Katakana.txt +1 -0
  95. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Line_Separator.txt +1 -0
  96. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Lowercase_Letter.txt +1 -0
  97. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Number.txt +1 -0
  98. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Open_Punctuation.txt +1 -0
  99. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Punctuation.txt +1 -0
  100. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Separator.txt +1 -0
  101. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Symbol.txt +1 -0
  102. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Titlecase_Letter.txt +1 -0
  103. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Uppercase_Letter.txt +1 -0
  104. package/third-party/indic-processor-deps/sacremoses/index.js +8 -0
  105. package/third-party/indic-processor-deps/sacremoses/indic.js +76 -0
  106. package/third-party/indic-processor-deps/sacremoses/normalizer.js +264 -0
  107. package/third-party/indic-processor-deps/sacremoses/pernuliprops.js +287 -0
  108. package/third-party/indic-processor-deps/sacremoses/tokenizer.js +1217 -0
  109. package/third-party/indic-processor.js +565 -0
@@ -0,0 +1,264 @@
1
+ /**
2
+ * JavaScript port of the Moses punctuation normalizer from
3
+ * https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl
4
+ */
5
+
6
/**
 * Punctuation normalizer ported from the Moses `normalize-punctuation.perl`
 * script.
 *
 * Each substitution table is a list of `[RegExp, replacement]` pairs that are
 * applied in order. Every non-ASCII character inside a pattern is written as a
 * `\u` escape so the patterns stay valid even if the file is re-encoded or
 * passed through a tool that folds full-width or curly characters into ASCII.
 */
class MosesPunctNormalizer {
  /**
   * Build the ordered substitution list for one language.
   *
   * @param {string} lang - The two-letter language code (default: "en")
   * @param {Object} options - Configuration options
   * @param {boolean} options.penn - Normalize Penn Treebank style quotations (default: true)
   * @param {boolean} options.normQuoteCommas - Normalize quotations and commas (default: true)
   * @param {boolean} options.normNumbers - Normalize numbers (default: true)
   * @param {boolean} options.preReplaceUnicodePunct - Replace Unicode punctuation before normalization (default: false)
   * @param {boolean} options.postRemoveControlChars - Remove control characters after normalization (default: false)
   * @param {boolean} options.perlParity - Exact parity with Perl script (default: false)
   */
  constructor (lang = 'en', options = {}) {
    // Caller-supplied keys win over the defaults listed first.
    const {
      penn,
      normQuoteCommas,
      normNumbers,
      preReplaceUnicodePunct,
      postRemoveControlChars,
      perlParity
    } = {
      penn: true,
      normQuoteCommas: true,
      normNumbers: true,
      preReplaceUnicodePunct: false,
      postRemoveControlChars: false,
      perlParity: false,
      ...options
    }

    // Extra whitespace patterns
    this.EXTRA_WHITESPACE = [
      [/\r/g, ''],
      [/\(/g, ' ('],
      [/\)/g, ') '],
      [/ +/g, ' '],
      [/\) ([.!:?;,])/g, ')$1'],
      [/\( /g, '('],
      [/ \)/g, ')'],
      [/(\d) %/g, '$1%'],
      [/ :/g, ':'],
      [/ ;/g, ';']
    ]

    // Backtick and doubled-apostrophe quotes (applied only when `penn` is set)
    this.NORMALIZE_UNICODE_IF_NOT_PENN = [
      [/`/g, "'"],
      [/''/g, ' " ']
    ]

    // Curly quotes, dashes, accents and ellipsis
    this.NORMALIZE_UNICODE = [
      [/\u201E/g, '"'], // double low-9 quotation mark
      [/\u201C/g, '"'], // left double quotation mark
      [/\u201D/g, '"'], // right double quotation mark
      [/\u2013/g, '-'], // en dash
      [/\u2014/g, ' - '], // em dash
      [/ +/g, ' '],
      [/\u00B4/g, "'"], // acute accent
      [/([a-zA-Z])\u2018([a-zA-Z])/g, "$1'$2"], // left single quote inside a word
      [/([a-zA-Z])\u2019([a-zA-Z])/g, "$1'$2"], // right single quote inside a word
      [/\u2018/g, "'"], // left single quotation mark
      [/\u201A/g, "'"], // single low-9 quotation mark
      [/\u2019/g, "'"], // right single quotation mark (index 11, replaced under perlParity)
      [/''/g, '"'],
      [/\u00B4\u00B4/g, '"'], // two acute accents
      [/\u2026/g, '...'] // horizontal ellipsis
    ]

    // French guillemets, with or without surrounding no-break spaces
    this.FRENCH_QUOTES = [
      [/\u00A0\u00AB\u00A0/g, '"'],
      [/\u00AB\u00A0/g, '"'],
      [/\u00AB/g, '"'],
      [/\u00A0\u00BB\u00A0/g, '"'],
      [/\u00A0\u00BB/g, '"'],
      [/\u00BB/g, '"']
    ]

    // No-break spaces next to punctuation and units
    this.HANDLE_PSEUDO_SPACES = [
      [/\u00A0%/g, '%'],
      [/n\u00BA\u00A0/g, 'n\u00BA '],
      [/\u00A0:/g, ':'],
      [/\u00A0\u00BAC/g, ' \u00BAC'],
      [/\u00A0cm/g, ' cm'],
      [/\u00A0\?/g, '?'],
      [/\u00A0!/g, '!'],
      [/\u00A0;/g, ';'],
      [/,\u00A0/g, ', '],
      [/ +/g, ' ']
    ]

    // English: move commas/periods inside the closing quotation mark
    this.EN_QUOTATION_FOLLOWED_BY_COMMA = [[/"([,.]+)/g, '$1"']]

    // German, Spanish, French: move commas/periods outside the quotation mark
    this.DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA = [
      [/,"/g, '",'],
      [/(\.+)"(\s*[^<])/g, '"$1$2'] // don't fix period at end of sentence
    ]

    // German, Spanish, Czech, French: no-break space between digits becomes a comma
    this.DE_ES_CZ_CS_FR = [[/(\d)\u00A0(\d)/g, '$1,$2']]

    // Other languages: no-break space between digits becomes a period
    this.OTHER = [[/(\d)\u00A0(\d)/g, '$1.$2']]

    // Full-width digits U+FF10..U+FF19 map to ASCII 0..9
    const fullwidthDigits = Array.from({ length: 10 }, (_, digit) => [
      new RegExp(String.fromCharCode(0xff10 + digit), 'g'),
      String(digit)
    ])

    // CJK and full-width punctuation mapped to ASCII equivalents
    this.REPLACE_UNICODE_PUNCTUATION = [
      [/\uFF0C/g, ','], // full-width comma
      [/\u3002\s*/g, '. '], // ideographic full stop
      [/\u3001/g, ','], // ideographic comma
      [/\u201D/g, '"'], // right double quotation mark
      [/\u201C/g, '"'], // left double quotation mark
      [/\u2236/g, ':'], // ratio
      [/\uFF1A/g, ':'], // full-width colon
      [/\uFF1F/g, '?'], // full-width question mark
      [/\u300A/g, '"'], // left double angle bracket
      [/\u300B/g, '"'], // right double angle bracket
      [/\uFF09/g, ')'], // full-width right parenthesis
      [/\uFF01/g, '!'], // full-width exclamation mark
      [/\uFF08/g, '('], // full-width left parenthesis
      [/\uFF1B/g, ';'], // full-width semicolon
      [/\u300D/g, '"'], // right corner bracket
      [/\u300C/g, '"'], // left corner bracket
      ...fullwidthDigits,
      [/\uFF0E\s*/g, '. '], // full-width full stop
      [/\uFF5E/g, '~'], // full-width tilde
      [/\u2019/g, "'"], // right single quotation mark
      [/\u2026/g, '...'], // horizontal ellipsis
      [/\u2501/g, '-'], // box drawings heavy horizontal
      [/\u3008/g, '<'], // left angle bracket
      [/\u3009/g, '>'], // right angle bracket
      [/\u3010/g, '['], // left black lenticular bracket
      [/\u3011/g, ']'], // right black lenticular bracket
      [/\uFF05/g, '%'] // full-width percent sign
    ]

    // Perl parity swaps three rules in place, so the indexes above matter.
    if (perlParity) {
      this.NORMALIZE_UNICODE[11] = [/\u2019/g, '"']
      this.FRENCH_QUOTES[0] = [/\u00A0\u00AB\u00A0/g, ' "']
      this.FRENCH_QUOTES[3] = [/\u00A0\u00BB\u00A0/g, '" ']
    }

    // Assemble the rules in application order.
    this.substitutions = [...this.EXTRA_WHITESPACE]

    // Added when `penn` is true, despite the table's name.
    if (penn) {
      this.substitutions.push(...this.NORMALIZE_UNICODE_IF_NOT_PENN)
    }

    this.substitutions.push(
      ...this.NORMALIZE_UNICODE,
      ...this.FRENCH_QUOTES,
      ...this.HANDLE_PSEUDO_SPACES
    )

    if (normQuoteCommas) {
      if (lang === 'en') {
        this.substitutions.push(...this.EN_QUOTATION_FOLLOWED_BY_COMMA)
      } else if (['de', 'es', 'fr'].includes(lang)) {
        this.substitutions.push(...this.DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA)
      }
    }

    if (normNumbers) {
      const usesCommaGrouping = ['de', 'es', 'cz', 'cs', 'fr'].includes(lang)
      this.substitutions.push(...(usesCommaGrouping ? this.DE_ES_CZ_CS_FR : this.OTHER))
    }

    this.preReplaceUnicodePunct = preReplaceUnicodePunct
    this.postRemoveControlChars = postRemoveControlChars
  }

  /**
   * Normalize punctuation in text
   *
   * Runs the optional Unicode-punctuation pass, then every configured
   * substitution in order, then the optional control-character removal, and
   * finally trims surrounding whitespace.
   *
   * @param {string} text - The text to normalize
   * @returns {string} - The normalized text
   */
  normalize (text) {
    let result = text

    if (this.preReplaceUnicodePunct) {
      result = this.replaceUnicodePunct(result)
    }

    for (const [pattern, replacement] of this.substitutions) {
      result = result.replace(pattern, replacement)
    }

    if (this.postRemoveControlChars) {
      result = this.removeControlChars(result)
    }

    return result.trim()
  }

  /**
   * Replace Unicode punctuation with ASCII equivalents
   *
   * @param {string} text - The text to process
   * @returns {string} - The processed text
   */
  replaceUnicodePunct (text) {
    return this.REPLACE_UNICODE_PUNCTUATION.reduce(
      (current, [pattern, replacement]) => current.replace(pattern, replacement),
      text
    )
  }

  /**
   * Remove control characters from text
   *
   * Strips every code point in Unicode general category C ("Other"): control,
   * format, surrogate, private-use and unassigned. The `u` flag is required
   * for the `\p{...}` property escape.
   *
   * @param {string} text - The text to process
   * @returns {string} - The processed text
   */
  removeControlChars (text) {
    return text.replace(/\p{C}/gu, '')
  }
}
263
+
264
+ module.exports = { MosesPunctNormalizer }
@@ -0,0 +1,287 @@
1
+ /**
2
+ * JavaScript port of the Perluniprops class from sacremoses
3
+ * This class is used to read lists of characters from the Perl Unicode Properties
4
+ * (see http://perldoc.perl.org/perluniprops.html).
5
+ */
6
+
7
+ const fs = require('bare-fs')
8
+
9
// Lookup table from Perl Unicode property name to the bundled data file that
// lists its characters. Each entry is a literal `require.asset(...)` call.
// NOTE(review): the paths are kept as string literals on the assumption that
// asset resolution is static; confirm before building them dynamically.
const pernuniPropsAssets = {
  'CJK': require.asset('./data/perluniprops/CJK.txt'),
  'CJKSymbols': require.asset('./data/perluniprops/CJKSymbols.txt'),
  'Close_Punctuation': require.asset('./data/perluniprops/Close_Punctuation.txt'),
  'Currency_Symbol': require.asset('./data/perluniprops/Currency_Symbol.txt'),
  'Han': require.asset('./data/perluniprops/Han.txt'),
  'Hangul': require.asset('./data/perluniprops/Hangul.txt'),
  'Hangul_Syllables': require.asset('./data/perluniprops/Hangul_Syllables.txt'),
  'Hiragana': require.asset('./data/perluniprops/Hiragana.txt'),
  'IsAlnum': require.asset('./data/perluniprops/IsAlnum.txt'),
  'IsAlnum-unichars-au': require.asset('./data/perluniprops/IsAlnum-unichars-au.txt'),
  'IsAlpha': require.asset('./data/perluniprops/IsAlpha.txt'),
  'IsAlpha-unichars-au': require.asset('./data/perluniprops/IsAlpha-unichars-au.txt'),
  'IsLower': require.asset('./data/perluniprops/IsLower.txt'),
  'IsN': require.asset('./data/perluniprops/IsN.txt'),
  'IsPf': require.asset('./data/perluniprops/IsPf.txt'),
  'IsPi': require.asset('./data/perluniprops/IsPi.txt'),
  'IsSc': require.asset('./data/perluniprops/IsSc.txt'),
  'IsSo': require.asset('./data/perluniprops/IsSo.txt'),
  'IsUpper': require.asset('./data/perluniprops/IsUpper.txt'),
  'Katakana': require.asset('./data/perluniprops/Katakana.txt'),
  'Line_Separator': require.asset('./data/perluniprops/Line_Separator.txt'),
  'Lowercase_Letter': require.asset('./data/perluniprops/Lowercase_Letter.txt'),
  'Number': require.asset('./data/perluniprops/Number.txt'),
  'Open_Punctuation': require.asset('./data/perluniprops/Open_Punctuation.txt'),
  'Punctuation': require.asset('./data/perluniprops/Punctuation.txt'),
  'Separator': require.asset('./data/perluniprops/Separator.txt'),
  'Symbol': require.asset('./data/perluniprops/Symbol.txt'),
  'Titlecase_Letter': require.asset('./data/perluniprops/Titlecase_Letter.txt'),
  'Uppercase_Letter': require.asset('./data/perluniprops/Uppercase_Letter.txt')
}
40
+
41
// Lookup table from `nonbreaking_prefix.<code>` file name to the bundled data
// file. Each entry is a literal `require.asset(...)` call.
// NOTE(review): the paths are kept as string literals on the assumption that
// asset resolution is static; confirm before building them dynamically.
const nonBreakingPrefixAssets = {
  // a - e
  'nonbreaking_prefix.as': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.as'),
  'nonbreaking_prefix.bn': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.bn'),
  'nonbreaking_prefix.ca': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.ca'),
  'nonbreaking_prefix.cs': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.cs'),
  'nonbreaking_prefix.de': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.de'),
  'nonbreaking_prefix.el': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.el'),
  'nonbreaking_prefix.en': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.en'),
  'nonbreaking_prefix.es': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.es'),
  'nonbreaking_prefix.et': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.et'),
  // f - l
  'nonbreaking_prefix.fi': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.fi'),
  'nonbreaking_prefix.fr': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.fr'),
  'nonbreaking_prefix.ga': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.ga'),
  'nonbreaking_prefix.gu': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.gu'),
  'nonbreaking_prefix.hi': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.hi'),
  'nonbreaking_prefix.hu': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.hu'),
  'nonbreaking_prefix.is': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.is'),
  'nonbreaking_prefix.it': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.it'),
  'nonbreaking_prefix.kn': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.kn'),
  'nonbreaking_prefix.lt': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.lt'),
  'nonbreaking_prefix.lv': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.lv'),
  // m - r
  'nonbreaking_prefix.ml': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.ml'),
  'nonbreaking_prefix.mni': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.mni'),
  'nonbreaking_prefix.mr': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.mr'),
  'nonbreaking_prefix.nl': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.nl'),
  'nonbreaking_prefix.or': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.or'),
  'nonbreaking_prefix.pa': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.pa'),
  'nonbreaking_prefix.pl': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.pl'),
  'nonbreaking_prefix.pt': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.pt'),
  'nonbreaking_prefix.ro': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.ro'),
  'nonbreaking_prefix.ru': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.ru'),
  // s - z
  'nonbreaking_prefix.sk': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.sk'),
  'nonbreaking_prefix.sl': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.sl'),
  'nonbreaking_prefix.sv': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.sv'),
  'nonbreaking_prefix.ta': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.ta'),
  'nonbreaking_prefix.tdt': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.tdt'),
  'nonbreaking_prefix.te': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.te'),
  'nonbreaking_prefix.yue': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.yue'),
  'nonbreaking_prefix.zh': require.asset('./data/nonbreaking_prefixes/nonbreaking_prefix.zh')
}
81
+
82
/**
 * Reads the bundled Perl Unicode property character lists and yields their
 * characters, loading each category at most once per instance.
 */
class Perluniprops {
  /**
   * Initialize the Perluniprops class
   */
  constructor () {
    // Null-prototype object so names such as "constructor" are never mistaken
    // for cached entries.
    this._cache = Object.create(null)
  }

  /**
   * Load a character set from a file
   * @param {string} category - The Unicode character category to load
   * @returns {string} - A string containing all characters in the category
   * @throws {Error} If the category has no entry in the asset table
   * @private
   */
  _loadCategory (category) {
    // Own-property lookup: inherited keys must not pass as asset paths.
    const isKnown = Object.prototype.hasOwnProperty.call(pernuniPropsAssets, category)
    const filePath = isKnown ? pernuniPropsAssets[category] : undefined

    if (!filePath) {
      throw new Error(`Category file not found: ${category}`)
    }

    const content = fs.readFileSync(filePath, { encoding: 'utf8' })
    return typeof content === 'string' ? content : ''
  }

  /**
   * Get characters from a specific Unicode category
   *
   * A failed load is logged once and cached as an empty string, so later
   * calls for the same category yield nothing without re-reading the file.
   *
   * @param {string} category - The Unicode character category
   * @returns {Generator} - A generator yielding characters from the category
   */
  * chars (category) {
    // Test for presence, not truthiness: an empty string is a valid cached value.
    if (!Object.prototype.hasOwnProperty.call(this._cache, category)) {
      try {
        this._cache[category] = this._loadCategory(category) || ''
      } catch (error) {
        console.error(`Error loading category ${category}: ${error.message}`)
        this._cache[category] = ''
      }
    }

    // Guard against a non-iterable value having been placed in the cache.
    const cached = this._cache[category]
    if (cached == null || typeof cached[Symbol.iterator] !== 'function') {
      this._cache[category] = ''
      return
    }

    yield * cached
  }
}
145
+
146
/**
 * Provides the nonbreaking-prefix word lists bundled with the package, looked
 * up by language name or language code.
 */
class NonbreakingPrefixes {
  /**
   * Initialize a new NonbreakingPrefixes instance
   */
  constructor () {
    // Language names mapped to the code used in the data file name.
    const codesByName = {
      assamese: 'as',
      bengali: 'bn',
      catalan: 'ca',
      czech: 'cs',
      german: 'de',
      greek: 'el',
      english: 'en',
      spanish: 'es',
      estonian: 'et',
      finnish: 'fi',
      french: 'fr',
      irish: 'ga',
      gujarati: 'gu',
      hindi: 'hi',
      hungarian: 'hu',
      icelandic: 'is',
      italian: 'it',
      kannada: 'kn',
      lithuanian: 'lt',
      latvian: 'lv',
      malayalam: 'ml',
      manipuri: 'mni',
      marathi: 'mr',
      dutch: 'nl',
      oriya: 'or',
      punjabi: 'pa',
      polish: 'pl',
      portuguese: 'pt',
      romanian: 'ro',
      russian: 'ru',
      slovak: 'sk',
      slovenian: 'sl',
      swedish: 'sv',
      tamil: 'ta',
      telugu: 'te',
      tetum: 'tdt',
      cantonese: 'yue',
      chinese: 'zh'
    }

    // Accept both the name and the code itself as lookup keys.
    this.available_langs = { ...codesByName }
    for (const code of Object.values(codesByName)) {
      this.available_langs[code] = code
    }

    // File name -> trimmed, non-empty lines, not yet filtered by the ignore
    // prefix, so one cached entry serves every `ignoreLineStartswith` value.
    this._cache = Object.create(null)
  }

  /**
   * Read one prefix file into trimmed, non-empty lines (comment lines included)
   * @param {string} filename - Key into the nonbreaking-prefix asset table
   * @returns {Array<string>} - The lines, or an empty array when the file is
   *   unknown or cannot be read (a warning or error is logged)
   * @private
   */
  _readLines (filename) {
    // Own-property lookup: inherited keys must not pass as asset paths.
    const isKnown = Object.prototype.hasOwnProperty.call(nonBreakingPrefixAssets, filename)
    const filePath = isKnown ? nonBreakingPrefixAssets[filename] : undefined

    if (!filePath) {
      console.warn(`Nonbreaking prefixes file not found: ${filename}`)
      return []
    }

    try {
      return fs
        .readFileSync(filePath, { encoding: 'utf8' })
        .split('\n')
        .map((line) => line.trim())
        .filter((line) => line)
    } catch (error) {
      console.error(`Error reading file ${filePath}: ${error.message}`)
      return []
    }
  }

  /**
   * Load nonbreaking prefixes from a file
   * @param {string} filename - The filename to load
   * @param {string} ignoreLineStartswith - Lines to ignore in file
   * @returns {Array<string>} - An array of nonbreaking prefixes
   * @private
   */
  _loadFile (filename, ignoreLineStartswith = '#') {
    return this._readLines(filename).filter(
      (line) => !line.startsWith(ignoreLineStartswith)
    )
  }

  /**
   * Generator function that yields nonbreaking prefixes for the specified language(s)
   *
   * An unrecognised, non-null `lang` falls back to English. Files are read at
   * most once per instance; the ignore prefix is applied on every call.
   *
   * @param {string|null} lang - Language code (default: null for all languages)
   * @param {string} ignoreLineStartswith - Lines to ignore in file (default: "#")
   * @yields {string} - Nonbreaking prefixes
   */
  * words (lang = null, ignoreLineStartswith = '#') {
    let codes
    if (lang && Object.prototype.hasOwnProperty.call(this.available_langs, lang)) {
      codes = [this.available_langs[lang]]
    } else if (lang === null) {
      // Names and codes share values, so de-duplicate.
      codes = new Set(Object.values(this.available_langs))
    } else {
      codes = ['en']
    }

    for (const code of codes) {
      const filename = `nonbreaking_prefix.${code}`

      if (!Object.prototype.hasOwnProperty.call(this._cache, filename)) {
        this._cache[filename] = this._readLines(filename)
      }

      for (const line of this._cache[filename]) {
        if (!line.startsWith(ignoreLineStartswith)) {
          yield line
        }
      }
    }
  }

  /**
   * Get all nonbreaking prefixes for the specified language(s) as an array
   * @param {string|null} lang - Language code
   * @param {string} ignoreLineStartswith - Lines to ignore in file
   * @returns {Array<string>} - An array of nonbreaking prefixes
   */
  getWordsAsArray (lang = null, ignoreLineStartswith = '#') {
    return Array.from(this.words(lang, ignoreLineStartswith))
  }
}
282
+
283
+ // Export both implementations
284
+ module.exports = {
285
+ Perluniprops,
286
+ NonbreakingPrefixes
287
+ }