@qvac/translation-nmtcpp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. package/LICENSE +202 -0
  2. package/README.md +470 -0
  3. package/binding.js +1 -0
  4. package/index.d.ts +82 -0
  5. package/index.js +188 -0
  6. package/lib/error.js +65 -0
  7. package/marian.js +186 -0
  8. package/package.json +69 -0
  9. package/prebuilds/android-arm/qvac__translation-nmtcpp.bare +0 -0
  10. package/prebuilds/android-arm64/qvac__translation-nmtcpp.bare +0 -0
  11. package/prebuilds/android-ia32/qvac__translation-nmtcpp.bare +0 -0
  12. package/prebuilds/android-x64/qvac__translation-nmtcpp.bare +0 -0
  13. package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare +0 -0
  14. package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare.exports +3622 -0
  15. package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare +0 -0
  16. package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare.exports +3731 -0
  17. package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare +0 -0
  18. package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare.exports +3603 -0
  19. package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare +0 -0
  20. package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare.exports +3603 -0
  21. package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare +0 -0
  22. package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare.exports +3720 -0
  23. package/prebuilds/linux-x64/qvac__translation-nmtcpp.bare +0 -0
  24. package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare +0 -0
  25. package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare.exports +0 -0
  26. package/third-party/indic-processor-deps/indicnlp/INDIC_NLP_LICENCE +9 -0
  27. package/third-party/indic-processor-deps/indicnlp/index.js +11 -0
  28. package/third-party/indic-processor-deps/indicnlp/indic_detokenize.js +141 -0
  29. package/third-party/indic-processor-deps/indicnlp/indic_normalize.js +1213 -0
  30. package/third-party/indic-processor-deps/indicnlp/indic_tokenize.js +123 -0
  31. package/third-party/indic-processor-deps/indicnlp/langinfo.js +609 -0
  32. package/third-party/indic-processor-deps/indicnlp/sinhala_transliterator.js +197 -0
  33. package/third-party/indic-processor-deps/indicnlp/unicode_transliterator.js +120 -0
  34. package/third-party/indic-processor-deps/sacremoses/SACREMOSES_LICENCE +21 -0
  35. package/third-party/indic-processor-deps/sacremoses/cjk.js +202 -0
  36. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/README.txt +8 -0
  37. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.as +65 -0
  38. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.bn +65 -0
  39. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ca +75 -0
  40. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.cs +390 -0
  41. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.de +325 -0
  42. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.el +1568 -0
  43. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.en +123 -0
  44. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.es +118 -0
  45. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.et +138 -0
  46. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fi +138 -0
  47. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fr +153 -0
  48. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ga +48 -0
  49. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.gu +105 -0
  50. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hi +113 -0
  51. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hu +103 -0
  52. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.is +251 -0
  53. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.it +180 -0
  54. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.kn +70 -0
  55. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lt +698 -0
  56. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lv +100 -0
  57. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ml +67 -0
  58. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mni +65 -0
  59. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mr +113 -0
  60. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.nl +115 -0
  61. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.or +101 -0
  62. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pa +102 -0
  63. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pl +283 -0
  64. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pt +210 -0
  65. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ro +38 -0
  66. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ru +293 -0
  67. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sk +474 -0
  68. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sl +78 -0
  69. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sv +97 -0
  70. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ta +71 -0
  71. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.tdt +210 -0
  72. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.te +70 -0
  73. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.yue +53 -0
  74. package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.zh +53 -0
  75. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJK.txt +23246 -0
  76. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJKSymbols.txt +1 -0
  77. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Close_Punctuation.txt +1 -0
  78. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Currency_Symbol.txt +1 -0
  79. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Han.txt +1 -0
  80. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul.txt +1 -0
  81. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul_Syllables.txt +1 -0
  82. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hiragana.txt +1 -0
  83. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum-unichars-au.txt +1 -0
  84. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum.txt +1 -0
  85. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha-unichars-au.txt +1 -0
  86. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha.txt +1 -0
  87. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsLower.txt +1 -0
  88. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsN.txt +1 -0
  89. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPf.txt +1 -0
  90. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPi.txt +1 -0
  91. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSc.txt +1 -0
  92. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSo.txt +1 -0
  93. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsUpper.txt +1 -0
  94. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Katakana.txt +1 -0
  95. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Line_Separator.txt +1 -0
  96. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Lowercase_Letter.txt +1 -0
  97. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Number.txt +1 -0
  98. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Open_Punctuation.txt +1 -0
  99. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Punctuation.txt +1 -0
  100. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Separator.txt +1 -0
  101. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Symbol.txt +1 -0
  102. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Titlecase_Letter.txt +1 -0
  103. package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Uppercase_Letter.txt +1 -0
  104. package/third-party/indic-processor-deps/sacremoses/index.js +8 -0
  105. package/third-party/indic-processor-deps/sacremoses/indic.js +76 -0
  106. package/third-party/indic-processor-deps/sacremoses/normalizer.js +264 -0
  107. package/third-party/indic-processor-deps/sacremoses/pernuliprops.js +287 -0
  108. package/third-party/indic-processor-deps/sacremoses/tokenizer.js +1217 -0
  109. package/third-party/indic-processor.js +565 -0
@@ -0,0 +1,123 @@
1
+ /**
2
+ *
3
+ * Copyright (c) 2013-present, Anoop Kunchukuttan
4
+ * All rights reserved.
5
+ *
6
+ * This source code is licensed under the MIT license found in the
7
+ * INDIC_NLP_LICENCE file in the indicnlp directory of this source tree.
8
+ *
9
+ * This code is a ported version of the Indic NLP Library. Please refer to NOTICE
10
+ * file in the root directory of this source tree.
11
+ */
12
+
13
// ASCII punctuation shared by both tokenizer patterns.
// Note: once placed inside a regex character class, the `\]` pair reads as an
// escaped `]`, so the backslash itself is not a member of the class.
const punctuation = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"

// Punctuation class used for every language except Urdu: ASCII punctuation
// plus U+0964, U+0965 and a handful of other script-specific marks.
// eslint-disable-next-line no-misleading-character-class
const trivTokenizerIndicPat = new RegExp(`([${punctuation}\u0964\u0965\uAAF1\uAAF0\uABEB\uABEC\uABED\uABEE\uABEF\u1C7E\u1C7F])`, 'g')

// Punctuation class used for Urdu: ASCII punctuation plus Arabic-script marks.
const trivTokenizerUrduPat = new RegExp(`([${punctuation}\u0609\u060A\u060C\u061E\u066A\u066B\u066C\u066D\u06D4])`, 'g')

// Digit groups separated by " , ", " . ", " : " or " / " (dates, decimals,
// section numbers) as they look AFTER punctuation has been space-padded.
const patNumSeq = /([0-9]+ [,.:/] )+[0-9]+/g

/**
 * Punctuation-boundary tokenizer for text in Brahmi-derived scripts.
 *
 * Every punctuation mark in `trivTokenizerIndicPat` becomes its own token.
 * Number sequences that the padding step split apart (e.g. "1 . 5") are glued
 * back together so dates and decimals survive as a single token.
 *
 * @param {string} text - text to tokenize
 * @returns {Array<string>} - list of tokens (never contains empty strings)
 */
function trivialTokenizeIndic (text) {
  // Tabs -> spaces, pad punctuation with spaces, then squeeze whitespace runs.
  const spaced = text
    .replace(/\t/g, ' ')
    .replace(trivTokenizerIndicPat, ' $1 ')
    .replace(/\s+/g, ' ')
    .trim()

  // Rebuild the string, removing the inner spaces of each number sequence.
  const pieces = []
  let cursor = 0
  for (const { 0: sequence, index } of spaced.matchAll(patNumSeq)) {
    // NOTE(review): a sequence starting at index 0 never satisfies
    // `index > cursor`, so it is left split ("1.5 kg" -> "1", ".", "5", "kg").
    // Presumably inherited from the reference implementation — confirm before
    // changing, since tokenization likely has to match the models' training.
    if (index <= cursor) continue
    pieces.push(spaced.substring(cursor, index), sequence.replace(/ /g, ''))
    cursor = index + sequence.length
  }
  pieces.push(spaced.substring(cursor))

  return pieces
    .join('')
    .split(' ')
    .filter((token) => token.length > 0)
}

/**
 * Punctuation-boundary tokenizer for Urdu text.
 *
 * Splits on the marks in `trivTokenizerUrduPat`. Unlike the Indic variant,
 * number sequences are not re-joined.
 *
 * The Python version had a commented-out section for urduhack; if an
 * equivalent JavaScript library exists, it could be used instead.
 *
 * @param {string} text - text to tokenize
 * @returns {Array<string>} - list of tokens (never contains empty strings)
 */
function trivialTokenizeUrdu (text) {
  const untabbed = text.replace(/\t/g, ' ')
  const padded = untabbed.replace(trivTokenizerUrduPat, ' $1 ')
  const squeezed = padded.replace(/\s+/g, ' ').trim()
  return squeezed.split(' ').filter((token) => token.length > 0)
}

/**
 * Entry point: tokenize on punctuation boundaries, choosing the punctuation
 * set by language.
 *
 * @param {string} text - text to tokenize
 * @param {string} lang - language code (default: 'hi'); only 'ur' selects the
 *   Urdu tokenizer, every other value uses the Indic tokenizer
 * @returns {Array<string>} - list of tokens
 */
function trivialTokenize (text, lang = 'hi') {
  const tokenizer = lang === 'ur' ? trivialTokenizeUrdu : trivialTokenizeIndic
  return tokenizer(text)
}
117
+
118
+ // Public API: the two script-specific tokenizers plus the language-based dispatcher
119
+ module.exports = {
120
+ trivialTokenizeIndic,
121
+ trivialTokenizeUrdu,
122
+ trivialTokenize
123
+ }
@@ -0,0 +1,609 @@
1
+ /**
2
+ *
3
+ * Copyright (c) 2013-present, Anoop Kunchukuttan
4
+ * All rights reserved.
5
+ *
6
+ * This source code is licensed under the MIT license found in the
7
+ * INDIC_NLP_LICENCE file in the indicnlp directory of this source tree.
8
+ *
9
+ * This code is a ported version of the Indic NLP Library. Please refer to NOTICE
10
+ * file in the root directory of this source tree.
11
+ */
12
+
13
// Language codes
const LC_TA = 'ta'

// [first, last] code point (inclusive) of the Unicode block each language code
// maps to. Several codes share a block.
// NOTE(review): 'kK' is not a standard ISO 639 code (presumably Konkani);
// kept as-is because callers may look it up by this exact key — confirm.
const SCRIPT_RANGES = {
  pa: [0x0a00, 0x0a7f],
  gu: [0x0a80, 0x0aff],
  or: [0x0b00, 0x0b7f],
  ta: [0x0b80, 0x0bff],
  te: [0x0c00, 0x0c7f],
  kn: [0x0c80, 0x0cff],
  ml: [0x0d00, 0x0d7f],
  si: [0x0d80, 0x0dff],
  hi: [0x0900, 0x097f],
  mr: [0x0900, 0x097f],
  kK: [0x0900, 0x097f],
  sa: [0x0900, 0x097f],
  ne: [0x0900, 0x097f],
  sd: [0x0900, 0x097f],
  bn: [0x0980, 0x09ff],
  as: [0x0980, 0x09ff]
}

const DRAVIDIAN_LANGUAGES = ['ta', 'te', 'kn', 'ml']
const IE_LANGUAGES = [
  'hi',
  'mr',
  'kK',
  'sa',
  'ne',
  'sd',
  'bn',
  'as',
  'pa',
  'gu',
  'or',
  'si'
]
const DANDA_DELIM_LANGUAGES = ['as', 'bn', 'hi', 'ne', 'or', 'pa', 'sa', 'sd']

const URDU_RANGES = [
  [0x0600, 0x06ff],
  [0x0750, 0x077f],
  [0xfb50, 0xfdff],
  [0xfe70, 0xfeff]
]

// All values below are OFFSETS from the start of a script block
// (i.e. code point minus SCRIPT_RANGES[lang][0]), not absolute code points.
const COORDINATED_RANGE_START_INCLUSIVE = 0
const COORDINATED_RANGE_END_INCLUSIVE = 0x6f

const NUMERIC_OFFSET_START = 0x66
const NUMERIC_OFFSET_END = 0x6f

const HALANTA_OFFSET = 0x4d
const AUM_OFFSET = 0x50
const NUKTA_OFFSET = 0x3c

// Absolute code points
const RUPEE_SIGN = 0x20b9

const DANDA = 0x0964
const DOUBLE_DANDA = 0x0965

// TODO: add missing fricatives and approximants
const VELAR_RANGE = [0x15, 0x19]
const PALATAL_RANGE = [0x1a, 0x1e]
const RETROFLEX_RANGE = [0x1f, 0x23]
const DENTAL_RANGE = [0x24, 0x29]
const LABIAL_RANGE = [0x2a, 0x2e]

// verify
const VOICED_LIST = [
  0x17, 0x18, 0x1c, 0x1d, 0x21, 0x22, 0x26, 0x27, 0x2c, 0x2d
]
const UNVOICED_LIST = [
  0x15, 0x16, 0x1a, 0x1b, 0x1f, 0x20, 0x24, 0x25, 0x2a, 0x2b
] // TODO: add sibilants/sonorants
const ASPIRATED_LIST = [
  0x16, 0x18, 0x1b, 0x1d, 0x20, 0x22, 0x25, 0x27, 0x2b, 0x2d
]
const UNASPIRATED_LIST = [
  0x15, 0x17, 0x1a, 0x1c, 0x1f, 0x21, 0x24, 0x26, 0x2a, 0x2c
]
// The labial nasal is offset 0x2e (last slot of LABIAL_RANGE). The previous
// value 0x2d is the aspirated voiced labial (it is listed in both VOICED_LIST
// and ASPIRATED_LIST), so it was wrongly reported as nasal and 0x2e was missed.
const NASAL_LIST = [0x19, 0x1e, 0x23, 0x28, 0x29, 0x2e]
const FRICATIVE_LIST = [0x36, 0x37, 0x38]
const APPROXIMANT_LIST = [0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35]

// TODO: ha has to be properly categorized

/**
 * Inclusive membership test against a [first, last] pair.
 * @param {number} cOffset - Character offset
 * @param {Array<number>} range - [first, last], both inclusive
 * @returns {boolean} True if first <= cOffset <= last
 */
function offsetWithin (cOffset, range) {
  return cOffset >= range[0] && cOffset <= range[1]
}

/**
 * Returns True if danda/double danda is a possible delimiter for the language
 * @param {string} lang - Language code
 * @returns {boolean} True if danda/double danda is a possible delimiter
 */
function isDandaDelim (lang) {
  return DANDA_DELIM_LANGUAGES.includes(lang)
}

/**
 * Get character offset - applicable to Brahmi derived Indic scripts
 *
 * Only the first UTF-16 code unit of `c` is inspected. The result may be
 * negative or above 0x7f when `c` lies outside the language's block.
 *
 * @param {string} c - Character
 * @param {string} lang - Language code (must be a key of SCRIPT_RANGES)
 * @returns {number} Character offset from the start of the script block
 * @throws {TypeError} If `lang` is not a key of SCRIPT_RANGES
 */
function getOffset (c, lang) {
  return c.charCodeAt(0) - SCRIPT_RANGES[lang][0]
}

/**
 * Convert offset to character - applicable to Brahmi derived Indic scripts
 * @param {number} c - Character offset
 * @param {string} lang - Language code (must be a key of SCRIPT_RANGES)
 * @returns {string} Character
 * @throws {TypeError} If `lang` is not a key of SCRIPT_RANGES
 */
function offsetToChar (c, lang) {
  return String.fromCharCode(c + SCRIPT_RANGES[lang][0])
}

/**
 * Check if offset is in coordinated range - applicable to Brahmi derived Indic scripts
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if in coordinated range
 */
function inCoordinatedRange (cOffset) {
  return (
    cOffset >= COORDINATED_RANGE_START_INCLUSIVE &&
    cOffset <= COORDINATED_RANGE_END_INCLUSIVE
  )
}

/**
 * Check if character belongs to Indic language - applicable to Brahmi derived Indic scripts
 *
 * True when the character falls inside the language's 0x80-wide block, or is
 * the danda / double danda (accepted for every language).
 *
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character belongs to Indic language
 */
function isIndiclangChar (c, lang) {
  const codeUnit = c.charCodeAt(0)
  const offset = getOffset(c, lang)
  if (offset >= 0 && offset <= 0x7f) return true
  return codeUnit === DANDA || codeUnit === DOUBLE_DANDA
}

// Character-based predicates: each one converts the character to its block
// offset and defers to the matching *Offset predicate further down.

/**
 * Is the character a vowel
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is a vowel
 */
function isVowel (c, lang) {
  return isVowelOffset(getOffset(c, lang))
}

/**
 * Is the character a vowel sign (maatraa)
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is a vowel sign
 */
function isVowelSign (c, lang) {
  return isVowelSignOffset(getOffset(c, lang))
}

/**
 * Is the character the halanta character
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is halanta
 */
function isHalanta (c, lang) {
  return isHalantaOffset(getOffset(c, lang))
}

/**
 * Is the character the nukta character
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is nukta
 */
function isNukta (c, lang) {
  return isNuktaOffset(getOffset(c, lang))
}

/**
 * Is the character the aum character
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is aum
 */
function isAum (c, lang) {
  return isAumOffset(getOffset(c, lang))
}

/**
 * Is the character a consonant
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is a consonant
 */
function isConsonant (c, lang) {
  return isConsonantOffset(getOffset(c, lang))
}

/**
 * Is the character a velar
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is a velar
 */
function isVelar (c, lang) {
  return isVelarOffset(getOffset(c, lang))
}

/**
 * Is the character a palatal
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is a palatal
 */
function isPalatal (c, lang) {
  return isPalatalOffset(getOffset(c, lang))
}

/**
 * Is the character a retroflex
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is a retroflex
 */
function isRetroflex (c, lang) {
  return isRetroflexOffset(getOffset(c, lang))
}

/**
 * Is the character a dental
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is a dental
 */
function isDental (c, lang) {
  return isDentalOffset(getOffset(c, lang))
}

/**
 * Is the character a labial
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is a labial
 */
function isLabial (c, lang) {
  return isLabialOffset(getOffset(c, lang))
}

/**
 * Is the character a voiced consonant
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is a voiced consonant
 */
function isVoiced (c, lang) {
  return isVoicedOffset(getOffset(c, lang))
}

/**
 * Is the character an unvoiced consonant
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is an unvoiced consonant
 */
function isUnvoiced (c, lang) {
  return isUnvoicedOffset(getOffset(c, lang))
}

/**
 * Is the character an aspirated consonant
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is an aspirated consonant
 */
function isAspirated (c, lang) {
  return isAspiratedOffset(getOffset(c, lang))
}

/**
 * Is the character an unaspirated consonant
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is an unaspirated consonant
 */
function isUnaspirated (c, lang) {
  return isUnaspiratedOffset(getOffset(c, lang))
}

/**
 * Is the character a nasal consonant
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is a nasal consonant
 */
function isNasal (c, lang) {
  return isNasalOffset(getOffset(c, lang))
}

/**
 * Is the character a fricative consonant
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is a fricative consonant
 */
function isFricative (c, lang) {
  return isFricativeOffset(getOffset(c, lang))
}

/**
 * Is the character an approximant consonant
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is an approximant consonant
 */
function isApproximant (c, lang) {
  return isApproximantOffset(getOffset(c, lang))
}

/**
 * Is the character a number
 * @param {string} c - Character
 * @param {string} lang - Language code
 * @returns {boolean} True if character is a number
 */
function isNumber (c, lang) {
  return isNumberOffset(getOffset(c, lang))
}

// Offset-based functions

/**
 * Is the offset a vowel
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is a vowel
 */
function isVowelOffset (cOffset) {
  return cOffset >= 0x04 && cOffset <= 0x14
}

/**
 * Is the offset a vowel sign (maatraa)
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is a vowel sign
 */
function isVowelSignOffset (cOffset) {
  return cOffset >= 0x3e && cOffset <= 0x4c
}

/**
 * Is the offset the halanta offset
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is halanta
 */
function isHalantaOffset (cOffset) {
  return cOffset === HALANTA_OFFSET
}

/**
 * Is the offset the nukta offset
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is nukta
 */
function isNuktaOffset (cOffset) {
  return cOffset === NUKTA_OFFSET
}

/**
 * Is the offset the aum offset
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is aum
 */
function isAumOffset (cOffset) {
  return cOffset === AUM_OFFSET
}

/**
 * Is the offset a consonant
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is a consonant
 */
function isConsonantOffset (cOffset) {
  return cOffset >= 0x15 && cOffset <= 0x39
}

/**
 * Is the offset a velar
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is a velar
 */
function isVelarOffset (cOffset) {
  return offsetWithin(cOffset, VELAR_RANGE)
}

/**
 * Is the offset a palatal
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is a palatal
 */
function isPalatalOffset (cOffset) {
  return offsetWithin(cOffset, PALATAL_RANGE)
}

/**
 * Is the offset a retroflex
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is a retroflex
 */
function isRetroflexOffset (cOffset) {
  return offsetWithin(cOffset, RETROFLEX_RANGE)
}

/**
 * Is the offset a dental
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is a dental
 */
function isDentalOffset (cOffset) {
  return offsetWithin(cOffset, DENTAL_RANGE)
}

/**
 * Is the offset a labial
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is a labial
 */
function isLabialOffset (cOffset) {
  return offsetWithin(cOffset, LABIAL_RANGE)
}

/**
 * Is the offset a voiced consonant
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is a voiced consonant
 */
function isVoicedOffset (cOffset) {
  return VOICED_LIST.includes(cOffset)
}

/**
 * Is the offset an unvoiced consonant
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is an unvoiced consonant
 */
function isUnvoicedOffset (cOffset) {
  return UNVOICED_LIST.includes(cOffset)
}

/**
 * Is the offset an aspirated consonant
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is an aspirated consonant
 */
function isAspiratedOffset (cOffset) {
  return ASPIRATED_LIST.includes(cOffset)
}

/**
 * Is the offset an unaspirated consonant
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is an unaspirated consonant
 */
function isUnaspiratedOffset (cOffset) {
  return UNASPIRATED_LIST.includes(cOffset)
}

/**
 * Is the offset a nasal consonant
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is a nasal consonant
 */
function isNasalOffset (cOffset) {
  return NASAL_LIST.includes(cOffset)
}

/**
 * Is the offset a fricative consonant
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is a fricative consonant
 */
function isFricativeOffset (cOffset) {
  return FRICATIVE_LIST.includes(cOffset)
}

/**
 * Is the offset an approximant consonant
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is an approximant consonant
 */
function isApproximantOffset (cOffset) {
  return APPROXIMANT_LIST.includes(cOffset)
}

/**
 * Is the offset a number
 * @param {number} cOffset - Character offset
 * @returns {boolean} True if offset is a number
 */
function isNumberOffset (cOffset) {
  return cOffset >= NUMERIC_OFFSET_START && cOffset <= NUMERIC_OFFSET_END
}
536
+
537
+ module.exports = { // Public surface: script tables and offset constants first, then character-based predicates, then their offset-based counterparts
538
+ LC_TA,
539
+ SCRIPT_RANGES,
540
+ DRAVIDIAN_LANGUAGES,
541
+ IE_LANGUAGES,
542
+ DANDA_DELIM_LANGUAGES,
543
+ URDU_RANGES,
544
+ COORDINATED_RANGE_START_INCLUSIVE,
545
+ COORDINATED_RANGE_END_INCLUSIVE,
546
+ NUMERIC_OFFSET_START,
547
+ NUMERIC_OFFSET_END,
548
+ HALANTA_OFFSET,
549
+ AUM_OFFSET,
550
+ NUKTA_OFFSET,
551
+ RUPEE_SIGN,
552
+ DANDA,
553
+ DOUBLE_DANDA,
554
+ VELAR_RANGE,
555
+ PALATAL_RANGE,
556
+ RETROFLEX_RANGE,
557
+ DENTAL_RANGE,
558
+ LABIAL_RANGE,
559
+ VOICED_LIST,
560
+ UNVOICED_LIST,
561
+ ASPIRATED_LIST,
562
+ UNASPIRATED_LIST,
563
+ NASAL_LIST,
564
+ FRICATIVE_LIST,
565
+ APPROXIMANT_LIST,
566
+ isDandaDelim,
567
+ getOffset,
568
+ offsetToChar,
569
+ inCoordinatedRange,
570
+ isIndiclangChar,
571
+ isVowel,
572
+ isVowelSign,
573
+ isHalanta,
574
+ isNukta,
575
+ isAum,
576
+ isConsonant,
577
+ isVelar,
578
+ isPalatal,
579
+ isRetroflex,
580
+ isDental,
581
+ isLabial,
582
+ isVoiced,
583
+ isUnvoiced,
584
+ isAspirated,
585
+ isUnaspirated,
586
+ isNasal,
587
+ isFricative,
588
+ isApproximant,
589
+ isNumber,
590
+ isVowelOffset,
591
+ isVowelSignOffset,
592
+ isHalantaOffset,
593
+ isNuktaOffset,
594
+ isAumOffset,
595
+ isConsonantOffset,
596
+ isVelarOffset,
597
+ isPalatalOffset,
598
+ isRetroflexOffset,
599
+ isDentalOffset,
600
+ isLabialOffset,
601
+ isVoicedOffset,
602
+ isUnvoicedOffset,
603
+ isAspiratedOffset,
604
+ isUnaspiratedOffset,
605
+ isNasalOffset,
606
+ isFricativeOffset,
607
+ isApproximantOffset,
608
+ isNumberOffset
609
+ }