@qvac/translation-nmtcpp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. package/LICENSE +202 -0
  2. package/README.md +470 -0
  3. package/binding.js +1 -0
  4. package/index.d.ts +82 -0
  5. package/index.js +188 -0
  6. package/lib/error.js +65 -0
  7. package/marian.js +186 -0
  8. package/package.json +69 -0
  9. package/prebuilds/android-arm/qvac__translation-nmtcpp.bare +0 -0
  10. package/prebuilds/android-arm64/qvac__translation-nmtcpp.bare +0 -0
  11. package/prebuilds/android-ia32/qvac__translation-nmtcpp.bare +0 -0
  12. package/prebuilds/android-x64/qvac__translation-nmtcpp.bare +0 -0
  13. package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare +0 -0
  14. package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare.exports +3622 -0
  15. package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare +0 -0
  16. package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare.exports +3731 -0
  17. package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare +0 -0
  18. package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare.exports +3603 -0
  19. package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare +0 -0
  20. package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare.exports +3603 -0
  21. package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare +0 -0
  22. package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare.exports +3720 -0
  23. package/prebuilds/linux-x64/qvac__translation-nmtcpp.bare +0 -0
  24. package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare +0 -0
  25. package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare.exports +0 -0
  26. package/third-party/indic-processor-deps/indicnlp/INDIC_NLP_LICENCE +9 -0
  27. package/third-party/indic-processor-deps/indicnlp/index.js +11 -0
  28. package/third-party/indic-processor-deps/indicnlp/indic_detokenize.js +141 -0
  29. package/third-party/indic-processor-deps/indicnlp/indic_normalize.js +1213 -0
  30. package/third-party/indic-processor-deps/indicnlp/indic_tokenize.js +123 -0
  31. package/third-party/indic-processor-deps/indicnlp/langinfo.js +609 -0
  32. package/third-party/indic-processor-deps/indicnlp/sinhala_transliterator.js +197 -0
  33. package/third-party/indic-processor-deps/indicnlp/unicode_transliterator.js +120 -0
  34. package/third-party/indic-processor-deps/sacremoses/SACREMOSES_LICENCE +21 -0
  35. package/third-party/indic-processor-deps/sacremoses/cjk.js +202 -0
  36. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/README.txt +8 -0
  37. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.as +65 -0
  38. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.bn +65 -0
  39. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ca +75 -0
  40. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.cs +390 -0
  41. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.de +325 -0
  42. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.el +1568 -0
  43. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.en +123 -0
  44. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.es +118 -0
  45. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.et +138 -0
  46. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fi +138 -0
  47. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fr +153 -0
  48. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ga +48 -0
  49. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.gu +105 -0
  50. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hi +113 -0
  51. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hu +103 -0
  52. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.is +251 -0
  53. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.it +180 -0
  54. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.kn +70 -0
  55. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lt +698 -0
  56. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lv +100 -0
  57. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ml +67 -0
  58. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mni +65 -0
  59. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mr +113 -0
  60. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.nl +115 -0
  61. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.or +101 -0
  62. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pa +102 -0
  63. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pl +283 -0
  64. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pt +210 -0
  65. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ro +38 -0
  66. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ru +293 -0
  67. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sk +474 -0
  68. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sl +78 -0
  69. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sv +97 -0
  70. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ta +71 -0
  71. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.tdt +210 -0
  72. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.te +70 -0
  73. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.yue +53 -0
  74. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.zh +53 -0
  75. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJK.txt +23246 -0
  76. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJKSymbols.txt +1 -0
  77. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Close_Punctuation.txt +1 -0
  78. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Currency_Symbol.txt +1 -0
  79. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Han.txt +1 -0
  80. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul.txt +1 -0
  81. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul_Syllables.txt +1 -0
  82. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hiragana.txt +1 -0
  83. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum-unichars-au.txt +1 -0
  84. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum.txt +1 -0
  85. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha-unichars-au.txt +1 -0
  86. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha.txt +1 -0
  87. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsLower.txt +1 -0
  88. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsN.txt +1 -0
  89. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPf.txt +1 -0
  90. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPi.txt +1 -0
  91. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSc.txt +1 -0
  92. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSo.txt +1 -0
  93. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsUpper.txt +1 -0
  94. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Katakana.txt +1 -0
  95. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Line_Separator.txt +1 -0
  96. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Lowercase_Letter.txt +1 -0
  97. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Number.txt +1 -0
  98. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Open_Punctuation.txt +1 -0
  99. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Punctuation.txt +1 -0
  100. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Separator.txt +1 -0
  101. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Symbol.txt +1 -0
  102. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Titlecase_Letter.txt +1 -0
  103. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Uppercase_Letter.txt +1 -0
  104. package/third-party/indic-processor-deps/sacremoses/index.js +8 -0
  105. package/third-party/indic-processor-deps/sacremoses/indic.js +76 -0
  106. package/third-party/indic-processor-deps/sacremoses/normalizer.js +264 -0
  107. package/third-party/indic-processor-deps/sacremoses/pernuliprops.js +287 -0
  108. package/third-party/indic-processor-deps/sacremoses/tokenizer.js +1217 -0
  109. package/third-party/indic-processor.js +565 -0
@@ -0,0 +1,197 @@
1
+ /**
2
+ *
3
+ * Copyright (c) 2013-present, Anoop Kunchukuttan
4
+ * All rights reserved.
5
+ *
6
+ * This source code is licensed under the MIT license found in the
7
+ * INDIC_NLP_LICENCE file in the indicnlp directory of this source tree.
8
+ *
9
+ * This code is a ported version of the Indic NLP Library. Please refer to NOTICE
10
+ * file in the root directory of this source tree.
11
+ */
12
+
13
/**
 * Sinhala <-> Devanagari transliterator driven by two explicit
 * code-point-to-code-point lookup tables.
 *
 * The tables are not inverses of each other: several Sinhala letters share a
 * single Devanagari target (and vice versa), so a round trip may not restore
 * the original text.
 */
class SinhalaDevanagariTransliterator {
  /**
   * Lookup table: Sinhala code point -> Devanagari code point.
   */
  static sinhalaDevnagMap = {
    '\u0d82': '\u0902',
    '\u0d83': '\u0903',
    '\u0d84': '\u0904',
    'අ': '\u0905',
    'ආ': '\u0906',
    'ඇ': '\u090d',
    'ඈ': '\u090d',
    'ඉ': '\u0907',
    'ඊ': '\u0908',
    'උ': '\u0909',
    'ඌ': '\u090a',
    'ඍ': '\u090b',
    'ඏ': '\u090c',
    'එ': '\u090e',
    'ඒ': '\u090f',
    'ඓ': '\u0910',
    'ඔ': '\u0912',
    'ඕ': '\u0913',
    'ඖ': '\u0914',
    'ක': '\u0915',
    'ඛ': '\u0916',
    'ග': '\u0917',
    'ඝ': '\u0918',
    'ඞ': '\u0919',
    'ඟ': '\u0919',
    'ච': '\u091a',
    'ඡ': '\u091b',
    'ජ': '\u091c',
    'ඣ': '\u091d',
    'ඤ': '\u091e',
    'ඥ': '\u091e',
    'ඦ': '\u091e',
    'ට': '\u091f',
    'ඨ': '\u0920',
    'ඩ': '\u0921',
    'ඪ': '\u0922',
    'ණ': '\u0923',
    'ඬ': '\u0923',
    'ත': '\u0924',
    'ථ': '\u0925',
    'ද': '\u0926',
    'ධ': '\u0927',
    'න': '\u0928',
    '\u0db2': '\u0928',
    'ඳ': '\u0928',
    'ප': '\u092a',
    'ඵ': '\u092b',
    'බ': '\u092c',
    'භ': '\u092d',
    'ම': '\u092e',
    'ය': '\u092f',
    'ර': '\u0930',
    'ල': '\u0932',
    'ළ': '\u0933',
    'ව': '\u0935',
    'ශ': '\u0936',
    'ෂ': '\u0937',
    'ස': '\u0938',
    'හ': '\u0939',
    '\u0dcf': '\u093e',
    '\u0dd0': '\u0949',
    '\u0dd1': '\u0949',
    '\u0dd2': '\u093f',
    '\u0dd3': '\u0940',
    '\u0dd4': '\u0941',
    '\u0dd6': '\u0942',
    '\u0dd8': '\u0943',
    '\u0dd9': '\u0946',
    '\u0dda': '\u0947',
    '\u0ddb': '\u0948',
    '\u0ddc': '\u094a',
    '\u0ddd': '\u094b',
    '\u0dde': '\u094c',
    '\u0dca': '\u094d'
  }

  /**
   * Lookup table: Devanagari code point -> Sinhala code point.
   */
  static devnagSinhalaMap = {
    '\u0900': '\u0d82',
    '\u0901': '\u0d82',
    '\u0902': '\u0d82',
    '\u0903': '\u0d83',
    'ऄ': '\u0d84',
    'अ': '\u0d85',
    'आ': '\u0d86',
    'इ': '\u0d89',
    'ई': '\u0d8a',
    'उ': '\u0d8b',
    'ऊ': '\u0d8c',
    'ऋ': '\u0d8d',
    'ऌ': '\u0d8f',
    'ऍ': '\u0d88',
    'ऎ': '\u0d91',
    'ए': '\u0d92',
    'ऐ': '\u0d93',
    'ऒ': '\u0d94',
    'ओ': '\u0d95',
    'औ': '\u0d96',
    'क': '\u0d9a',
    'ख': '\u0d9b',
    'ग': '\u0d9c',
    'घ': '\u0d9d',
    'ङ': '\u0d9e',
    'च': '\u0da0',
    'छ': '\u0da1',
    'ज': '\u0da2',
    'झ': '\u0da3',
    'ञ': '\u0da4',
    'ट': '\u0da7',
    'ठ': '\u0da8',
    'ड': '\u0da9',
    'ढ': '\u0daa',
    'ण': '\u0dab',
    'त': '\u0dad',
    'थ': '\u0dae',
    'द': '\u0daf',
    'ध': '\u0db0',
    'न': '\u0db1',
    'ऩ': '\u0db1',
    'प': '\u0db4',
    'फ': '\u0db5',
    'ब': '\u0db6',
    'भ': '\u0db7',
    'म': '\u0db8',
    'य': '\u0dba',
    'र': '\u0dbb',
    'ल': '\u0dbd',
    'ळ': '\u0dc5',
    'व': '\u0dc0',
    'श': '\u0dc1',
    'ष': '\u0dc2',
    'स': '\u0dc3',
    'ह': '\u0dc4',
    '\u093e': '\u0dcf',
    '\u0949': '\u0dd1',
    '\u093f': '\u0dd2',
    '\u0940': '\u0dd3',
    '\u0941': '\u0dd4',
    '\u0942': '\u0dd6',
    '\u0943': '\u0dd8',
    '\u0946': '\u0dd9',
    '\u0947': '\u0dda',
    '\u0948': '\u0ddb',
    '\u094a': '\u0ddc',
    '\u094b': '\u0ddd',
    '\u094c': '\u0dde',
    '\u094d': '\u0dca'
  }

  /**
   * Convert Devanagari text to Sinhala, one code point at a time.
   * Code points without an entry in devnagSinhalaMap are copied unchanged.
   * @param {string} text - Input Devanagari text
   * @returns {string} - Transliterated Sinhala text
   */
  static devanagariToSinhala (text) {
    const table = SinhalaDevanagariTransliterator.devnagSinhalaMap
    const pieces = []
    for (const ch of Array.from(text)) {
      pieces.push(table[ch] || ch)
    }
    return pieces.join('')
  }

  /**
   * Convert Sinhala text to Devanagari, one code point at a time.
   * Code points without an entry in sinhalaDevnagMap are copied unchanged.
   * @param {string} text - Input Sinhala text
   * @returns {string} - Transliterated Devanagari text
   */
  static sinhalaToDevanagari (text) {
    const table = SinhalaDevanagariTransliterator.sinhalaDevnagMap
    const pieces = []
    for (const ch of Array.from(text)) {
      pieces.push(table[ch] || ch)
    }
    return pieces.join('')
  }
}
194
+
195
+ module.exports = {
196
+ SinhalaDevanagariTransliterator
197
+ }
@@ -0,0 +1,120 @@
1
+ /**
2
+ *
3
+ * Copyright (c) 2013-present, Anoop Kunchukuttan
4
+ * All rights reserved.
5
+ *
6
+ * This source code is licensed under the MIT license found in the
7
+ * INDIC_NLP_LICENCE file in the indicnlp directory of this source tree.
8
+ *
9
+ * This code is a ported version of the Indic NLP Library. Please refer to NOTICE
10
+ * file in the root directory of this source tree.
11
+ */
12
+
13
+ const langinfo = require('./langinfo')
14
+ const sdt = require('./sinhala_transliterator').SinhalaDevanagariTransliterator
15
+
16
/**
 * Base class for rule-based transliteration among Indian languages.
 *
 * Transliteration works by taking each character's offset from the start of
 * the source script's range (langinfo.SCRIPT_RANGES[code][0]) and re-basing
 * that offset onto the start of the target script's range.
 *
 * Script pair specific transliterators should derive from this class and override the transliterate() method.
 * They can call the super class 'transliterate()' method to avail of the common transliteration.
 */
class UnicodeIndicTransliterator {
  /**
   * Adjust a script-relative offset for a Tamil target.
   *
   * - Offsets 0x15..0x28 (the first four consonant rows, five letters each)
   *   collapse onto the first letter of their row, except for the first and
   *   last letter of each row and offset 0x1c ('ja'), which are kept.
   * - Offsets 0x2b, 0x2c and 0x2d (fifth row) collapse onto 0x2a.
   * - Offset 0x36 ('sh') becomes 0x37 ('Sh').
   *
   * @param {number} offset - Character offset
   * @returns {number} - Corrected offset
   * @private
   */
  static _correctTamilMapping (offset) {
    // For first 4 consonant rows of varnamala
    // Exception: ja has a mapping in Tamil
    if (
      offset >= 0x15 &&
      offset <= 0x28 &&
      offset !== 0x1c &&
      !((offset - 0x15) % 5 === 0 || (offset - 0x15) % 5 === 4)
    ) {
      const substChar = Math.floor((offset - 0x15) / 5)
      offset = 0x15 + 5 * substChar
    }

    // For 5th consonant row of varnamala
    if (offset === 0x2b || offset === 0x2c || offset === 0x2d) {
      offset = 0x2a
    }

    // 'sh' becomes 'Sh'
    if (offset === 0x36) {
      offset = 0x37
    }

    return offset
  }

  /**
   * Convert the source language script (lang1) to target language script (lang2).
   *
   * The text is returned unchanged when either language code is not an own
   * key of langinfo.SCRIPT_RANGES. Sinhala ('si') is routed through
   * Devanagari ('hi') on either side using the Sinhala transliterator.
   *
   * @param {string} text - Text to transliterate
   * @param {string} lang1Code - Source language code
   * @param {string} lang2Code - Target language code
   * @returns {string} - Transliterated text
   */
  static transliterate (text, lang1Code, lang2Code) {
    const scriptRanges = langinfo.SCRIPT_RANGES
    // Own-property check: the `in` operator would also accept inherited names
    // such as 'constructor' or 'toString', which have no range and would turn
    // every mapped character into '\u0000'.
    const isKnownScript = (code) =>
      Object.prototype.hasOwnProperty.call(scriptRanges, code)

    if (!isKnownScript(lang1Code) || !isKnownScript(lang2Code)) {
      return text
    }

    // If Sinhala is source, do a mapping to Devanagari first
    let sourceText = text
    let sourceCode = lang1Code
    if (sourceCode === 'si') {
      sourceText = sdt.sinhalaToDevanagari(sourceText)
      sourceCode = 'hi'
    }

    // If Sinhala is target, make Devanagari the intermediate target
    const targetIsSinhala = lang2Code === 'si'
    const targetCode = targetIsSinhala ? 'hi' : lang2Code

    const transLitText = []
    for (const c of sourceText) {
      let newc = c
      const offset = c.charCodeAt(0) - scriptRanges[sourceCode][0]

      // Only offsets inside the coordinated range are re-based; U+0964 and
      // U+0965 (danda, double danda) are always copied through as-is.
      if (
        offset >= langinfo.COORDINATED_RANGE_START_INCLUSIVE &&
        offset <= langinfo.COORDINATED_RANGE_END_INCLUSIVE &&
        c !== '\u0964' &&
        c !== '\u0965'
      ) {
        const correctedOffset =
          targetCode === 'ta'
            ? UnicodeIndicTransliterator._correctTamilMapping(offset)
            : offset

        newc = String.fromCharCode(scriptRanges[targetCode][0] + correctedOffset)
      }

      transLitText.push(newc)
    }

    const result = transLitText.join('')

    // If Sinhala is target, convert from Devanagari
    return targetIsSinhala ? sdt.devanagariToSinhala(result) : result
  }
}
117
+
118
+ module.exports = {
119
+ UnicodeIndicTransliterator
120
+ }
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020 alvations
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,202 @@
1
+ /**
2
+ * This module provides functions to detect CJK (Chinese, Japanese, Korean) characters
3
+ * based on their Unicode code points.
4
+ */
5
+
6
/**
 * CJKChars class that enumerates the code points of CJK characters
 * as listed on http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane
 *
 * Every property is a two-element array [start, end] of code points, both
 * bounds inclusive. `ranges` collects all of them in one array.
 */
class CJKChars {
  constructor () {
    // Hangul Jamo (1100–11FF)
    this.Hangul_Jamo = [0x1100, 0x11ff]

    // CJK Radicals Supplement (2E80–2EFF)
    // Kangxi Radicals (2F00–2FDF)
    // Ideographic Description Characters (2FF0–2FFF)
    // CJK Symbols and Punctuation (3000–303F)
    // Hiragana (3040–309F)
    // Katakana (30A0–30FF)
    // Bopomofo (3100–312F)
    // Hangul Compatibility Jamo (3130–318F)
    // Kanbun (3190–319F)
    // Bopomofo Extended (31A0–31BF)
    // CJK Strokes (31C0–31EF)
    // Katakana Phonetic Extensions (31F0–31FF)
    // Enclosed CJK Letters and Months (3200–32FF)
    // CJK Compatibility (3300–33FF)
    // CJK Unified Ideographs Extension A (3400–4DBF)
    // Yijing Hexagram Symbols (4DC0–4DFF)
    // CJK Unified Ideographs (4E00–9FFF)
    // Yi Syllables (A000–A48F)
    // Yi Radicals (A490–A4CF)
    this.CJK_Radicals = [0x2e80, 0xa4cf]

    // Phags-pa (A840–A87F)
    this.Phags_Pa = [0xa840, 0xa87f]

    // Hangul Syllables (AC00–D7AF)
    this.Hangul_Syllables = [0xac00, 0xd7af]

    // CJK Compatibility Ideographs (F900–FAFF)
    this.CJK_Compatibility_Ideographs = [0xf900, 0xfaff]

    // CJK Compatibility Forms (FE30–FE4F)
    this.CJK_Compatibility_Forms = [0xfe30, 0xfe4f]

    // Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters
    this.Katakana_Hangul_Halfwidth = [0xff65, 0xffdc]

    // Ideographic Symbols and Punctuation (16FE0–16FFF)
    this.Ideographic_Symbols_And_Punctuation = [0x16fe0, 0x16fff]

    // Tangut (17000-187FF)
    // Tangut Components (18800-18AFF)
    this.Tangut = [0x17000, 0x18aff]

    // Kana Supplement (1B000-1B0FF)
    // Kana Extended-A (1B100-1B12F)
    this.Kana_Supplement = [0x1b000, 0x1b12f]

    // Nushu (1B170-1B2FF)
    this.Nushu = [0x1b170, 0x1b2ff]

    // Supplementary Ideographic Plane (20000–2FFFF)
    this.Supplementary_Ideographic_Plane = [0x20000, 0x2ffff]

    // Collect all ranges in a single array
    this.ranges = [
      this.Hangul_Jamo,
      this.CJK_Radicals,
      this.Phags_Pa,
      this.Hangul_Syllables,
      this.CJK_Compatibility_Ideographs,
      this.CJK_Compatibility_Forms,
      this.Katakana_Hangul_Halfwidth,
      this.Ideographic_Symbols_And_Punctuation,
      this.Tangut,
      this.Kana_Supplement,
      this.Nushu,
      this.Supplementary_Ideographic_Plane
    ]
  }
}

// Range list shared by the helpers below; built once at module load
const cjkCharsRanges = new CJKChars().ranges

/**
 * Checks if a character is a CJK (Chinese, Japanese, Korean) character
 *
 * The input must be exactly one Unicode code point: either a single UTF-16
 * code unit, or a surrogate pair encoding one character outside the BMP.
 *
 * @param {string} character - The character to check (must be a single character)
 * @returns {boolean} - True if the character is a CJK character, false otherwise
 * @throws {Error} If the input is not a string holding exactly one character
 *
 * @example
 * isCJK('世'); // true
 * isCJK('A'); // false
 * isCJK('ひ'); // true (Hiragana)
 * isCJK('カ'); // true (Katakana)
 * isCJK('한'); // true (Hangul)
 * isCJK('𠀀'); // true (CJK Extension B, outside the BMP)
 */
function isCJK (character) {
  const codePoint =
    typeof character === 'string' ? character.codePointAt(0) : undefined

  // A code point above U+FFFF occupies two UTF-16 code units, so a valid
  // single character has length 2 in that case and length 1 otherwise.
  const expectedLength = codePoint > 0xffff ? 2 : 1
  if (codePoint === undefined || character.length !== expectedLength) {
    throw new Error('isCJK requires a single character as input')
  }

  // Both range bounds are inclusive
  return cjkCharsRanges.some(
    ([start, end]) => codePoint >= start && codePoint <= end
  )
}

/**
 * Checks if a string contains any CJK characters
 *
 * @param {string} text - The string to check
 * @returns {boolean} - True if the string contains any CJK characters, false otherwise
 *
 * @example
 * containsCJK('Hello 世界'); // true
 * containsCJK('Hello world'); // false
 */
function containsCJK (text) {
  // String iteration yields whole code points, so surrogate pairs arrive as
  // one character and unpaired surrogates arrive on their own.
  for (const char of text) {
    if (isCJK(char)) {
      return true
    }
  }

  return false
}

/**
 * Counts the number of CJK characters in a string
 *
 * @param {string} text - The string to check
 * @returns {number} - The count of CJK characters in the string
 *
 * @example
 * countCJKChars('Hello 世界'); // 2
 * countCJKChars('Hello world'); // 0
 */
function countCJKChars (text) {
  let count = 0

  // String iteration yields whole code points (see containsCJK)
  for (const char of text) {
    if (isCJK(char)) {
      count++
    }
  }

  return count
}
196
+
197
+ module.exports = {
198
+ CJKChars,
199
+ isCJK,
200
+ containsCJK,
201
+ countCJKChars
202
+ }
@@ -0,0 +1,8 @@
1
+ The language suffix can be found here:
2
+
3
+ http://www.loc.gov/standards/iso639-2/php/code_list.php
4
+
5
+ This code includes data from Daniel Naber's Language Tools (Czech abbreviations).
6
+ This code includes data from the Czech Wiktionary (also Czech abbreviations).
7
+
8
+
@@ -0,0 +1,65 @@
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+
3
+ #common exceptions
4
+ # Dr
5
+
6
+
7
+ #others
8
+
9
+
10
+ #phonetics
11
+ # A
12
+
13
+ # B
14
+ বি
15
+ # C
16
+ সি
17
+ # D
18
+ ডি
19
+ # E
20
+
21
+ # F
22
+ এফ
23
+ # G
24
+ জি
25
+ # H
26
+ এইচ
27
+ # I
28
+ আম
29
+ # J
30
+ জে
31
+ # K
32
+ কে
33
+ # L
34
+ এল
35
+ # M
36
+ এম
37
+ # N
38
+ এন
39
+ # O
40
+ হে
41
+ # P
42
+ পি
43
+ # Q
44
+ কিউ
45
+ # R
46
+ আর
47
+ # S
48
+ এস
49
+ # T
50
+ টি
51
+ # U
52
+ ইউ
53
+ # V
54
+ ভি
55
+ # W
56
+ ডব্লু
57
+ # X
58
+ এক্স
59
+ # Y
60
+ ওয়াই
61
+ # Z
62
+ জেড
63
+
64
+ #consonants
65
+
@@ -0,0 +1,65 @@
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+
3
+ #common exceptions
4
+ # Dr
5
+
6
+
7
+ #others
8
+
9
+
10
+ #phonetics
11
+ # A
12
+
13
+ # B
14
+ বি
15
+ # C
16
+ সি
17
+ # D
18
+ ডি
19
+ # E
20
+
21
+ # F
22
+ এফ
23
+ # G
24
+ জি
25
+ # H
26
+ এইচ
27
+ # I
28
+ আম
29
+ # J
30
+ জে
31
+ # K
32
+ কে
33
+ # L
34
+ এল
35
+ # M
36
+ এম
37
+ # N
38
+ এন
39
+ # O
40
+ হে
41
+ # P
42
+ পি
43
+ # Q
44
+ কিউ
45
+ # R
46
+ আর
47
+ # S
48
+ এস
49
+ # T
50
+ টি
51
+ # U
52
+ ইউ
53
+ # V
54
+ ভি
55
+ # W
56
+ ডব্লু
57
+ # X
58
+ এক্স
59
+ # Y
60
+ ওয়াই
61
+ # Z
62
+ জেড
63
+
64
+ #consonants
65
+