@keymanapp/kmc-model 17.0.85-alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. package/.nyc_output/coverage-10524-1681239236645-0.json +1 -0
  2. package/Makefile +38 -0
  3. package/build/cjs-src/lexical-model-compiler.cjs +152688 -0
  4. package/build/src/build-trie.d.ts +40 -0
  5. package/build/src/build-trie.d.ts.map +1 -0
  6. package/build/src/build-trie.js +362 -0
  7. package/build/src/build-trie.js.map +1 -0
  8. package/build/src/join-word-breaker-decorator.d.ts +10 -0
  9. package/build/src/join-word-breaker-decorator.d.ts.map +1 -0
  10. package/build/src/join-word-breaker-decorator.js +121 -0
  11. package/build/src/join-word-breaker-decorator.js.map +1 -0
  12. package/build/src/lexical-model-compiler.d.ts +19 -0
  13. package/build/src/lexical-model-compiler.d.ts.map +1 -0
  14. package/build/src/lexical-model-compiler.js +155 -0
  15. package/build/src/lexical-model-compiler.js.map +1 -0
  16. package/build/src/lexical-model.d.ts +135 -0
  17. package/build/src/lexical-model.d.ts.map +1 -0
  18. package/build/src/lexical-model.js +6 -0
  19. package/build/src/lexical-model.js.map +1 -0
  20. package/build/src/main.d.ts +15 -0
  21. package/build/src/main.d.ts.map +1 -0
  22. package/build/src/main.js +46 -0
  23. package/build/src/main.js.map +1 -0
  24. package/build/src/model-compiler-errors.d.ts +77 -0
  25. package/build/src/model-compiler-errors.d.ts.map +1 -0
  26. package/build/src/model-compiler-errors.js +156 -0
  27. package/build/src/model-compiler-errors.js.map +1 -0
  28. package/build/src/model-defaults.d.ts +56 -0
  29. package/build/src/model-defaults.d.ts.map +1 -0
  30. package/build/src/model-defaults.js +106 -0
  31. package/build/src/model-defaults.js.map +1 -0
  32. package/build/src/model-definitions.d.ts +71 -0
  33. package/build/src/model-definitions.d.ts.map +1 -0
  34. package/build/src/model-definitions.js +189 -0
  35. package/build/src/model-definitions.js.map +1 -0
  36. package/build/src/script-overrides-decorator.d.ts +4 -0
  37. package/build/src/script-overrides-decorator.d.ts.map +1 -0
  38. package/build/src/script-overrides-decorator.js +63 -0
  39. package/build/src/script-overrides-decorator.js.map +1 -0
  40. package/build/test/helpers/index.d.ts +69 -0
  41. package/build/test/helpers/index.d.ts.map +1 -0
  42. package/build/test/helpers/index.js +160 -0
  43. package/build/test/helpers/index.js.map +1 -0
  44. package/build/test/test-compile-model-with-pseudoclosure.d.ts +2 -0
  45. package/build/test/test-compile-model-with-pseudoclosure.d.ts.map +1 -0
  46. package/build/test/test-compile-model-with-pseudoclosure.js +200 -0
  47. package/build/test/test-compile-model-with-pseudoclosure.js.map +1 -0
  48. package/build/test/test-compile-model.d.ts +2 -0
  49. package/build/test/test-compile-model.d.ts.map +1 -0
  50. package/build/test/test-compile-model.js +30 -0
  51. package/build/test/test-compile-model.js.map +1 -0
  52. package/build/test/test-compile-trie.d.ts +2 -0
  53. package/build/test/test-compile-trie.d.ts.map +1 -0
  54. package/build/test/test-compile-trie.js +125 -0
  55. package/build/test/test-compile-trie.js.map +1 -0
  56. package/build/test/test-default-apply-case.d.ts +2 -0
  57. package/build/test/test-default-apply-case.d.ts.map +1 -0
  58. package/build/test/test-default-apply-case.js +105 -0
  59. package/build/test/test-default-apply-case.js.map +1 -0
  60. package/build/test/test-default-search-term-to-key.d.ts +2 -0
  61. package/build/test/test-default-search-term-to-key.d.ts.map +1 -0
  62. package/build/test/test-default-search-term-to-key.js +148 -0
  63. package/build/test/test-default-search-term-to-key.js.map +1 -0
  64. package/build/test/test-error-logger.d.ts +2 -0
  65. package/build/test/test-error-logger.d.ts.map +1 -0
  66. package/build/test/test-error-logger.js +26 -0
  67. package/build/test/test-error-logger.js.map +1 -0
  68. package/build/test/test-join-word-breaker.d.ts +2 -0
  69. package/build/test/test-join-word-breaker.d.ts.map +1 -0
  70. package/build/test/test-join-word-breaker.js +84 -0
  71. package/build/test/test-join-word-breaker.js.map +1 -0
  72. package/build/test/test-model-definitions.d.ts +2 -0
  73. package/build/test/test-model-definitions.d.ts.map +1 -0
  74. package/build/test/test-model-definitions.js +165 -0
  75. package/build/test/test-model-definitions.js.map +1 -0
  76. package/build/test/test-override-script-defaults.d.ts +2 -0
  77. package/build/test/test-override-script-defaults.d.ts.map +1 -0
  78. package/build/test/test-override-script-defaults.js +28 -0
  79. package/build/test/test-override-script-defaults.js.map +1 -0
  80. package/build/test/test-parse-wordlist.d.ts +2 -0
  81. package/build/test/test-parse-wordlist.d.ts.map +1 -0
  82. package/build/test/test-parse-wordlist.js +110 -0
  83. package/build/test/test-parse-wordlist.js.map +1 -0
  84. package/build/test/test-punctuation.d.ts +2 -0
  85. package/build/test/test-punctuation.d.ts.map +1 -0
  86. package/build/test/test-punctuation.js +31 -0
  87. package/build/test/test-punctuation.js.map +1 -0
  88. package/build/test/tsconfig.tsbuildinfo +1 -0
  89. package/build/test/wordbreakers/data.d.ts +35 -0
  90. package/build/test/wordbreakers/data.d.ts.map +1 -0
  91. package/build/test/wordbreakers/data.js +1778 -0
  92. package/build/test/wordbreakers/data.js.map +1 -0
  93. package/build/test/wordbreakers/default-wordbreaker-esm.d.ts +10 -0
  94. package/build/test/wordbreakers/default-wordbreaker-esm.d.ts.map +1 -0
  95. package/build/test/wordbreakers/default-wordbreaker-esm.js +354 -0
  96. package/build/test/wordbreakers/default-wordbreaker-esm.js.map +1 -0
  97. package/build/tsconfig.tsbuildinfo +1 -0
  98. package/build.sh +73 -0
  99. package/coverage/lcov-report/base.css +224 -0
  100. package/coverage/lcov-report/block-navigation.js +87 -0
  101. package/coverage/lcov-report/favicon.png +0 -0
  102. package/coverage/lcov-report/index.html +161 -0
  103. package/coverage/lcov-report/prettify.css +1 -0
  104. package/coverage/lcov-report/prettify.js +2 -0
  105. package/coverage/lcov-report/sort-arrow-sprite.png +0 -0
  106. package/coverage/lcov-report/sorter.js +196 -0
  107. package/coverage/lcov-report/src/build-trie.ts.html +1618 -0
  108. package/coverage/lcov-report/src/index.html +221 -0
  109. package/coverage/lcov-report/src/join-word-breaker-decorator.ts.html +487 -0
  110. package/coverage/lcov-report/src/lexical-model-compiler.ts.html +622 -0
  111. package/coverage/lcov-report/src/main.ts.html +271 -0
  112. package/coverage/lcov-report/src/model-compiler-errors.ts.html +691 -0
  113. package/coverage/lcov-report/src/model-defaults.ts.html +415 -0
  114. package/coverage/lcov-report/src/model-definitions.ts.html +748 -0
  115. package/coverage/lcov-report/src/script-overrides-decorator.ts.html +310 -0
  116. package/coverage/lcov-report/test/helpers/index.html +116 -0
  117. package/coverage/lcov-report/test/helpers/index.ts.html +646 -0
  118. package/coverage/lcov-report/test/index.html +266 -0
  119. package/coverage/lcov-report/test/test-compile-model-with-pseudoclosure.ts.html +802 -0
  120. package/coverage/lcov-report/test/test-compile-model.ts.html +187 -0
  121. package/coverage/lcov-report/test/test-compile-trie.ts.html +541 -0
  122. package/coverage/lcov-report/test/test-default-apply-case.ts.html +466 -0
  123. package/coverage/lcov-report/test/test-default-search-term-to-key.ts.html +628 -0
  124. package/coverage/lcov-report/test/test-error-logger.ts.html +196 -0
  125. package/coverage/lcov-report/test/test-join-word-breaker.ts.html +376 -0
  126. package/coverage/lcov-report/test/test-model-definitions.ts.html +676 -0
  127. package/coverage/lcov-report/test/test-override-script-defaults.ts.html +184 -0
  128. package/coverage/lcov-report/test/test-parse-wordlist.ts.html +466 -0
  129. package/coverage/lcov-report/test/test-punctuation.ts.html +190 -0
  130. package/coverage/lcov-report/test/wordbreakers/data.ts.html +5413 -0
  131. package/coverage/lcov-report/test/wordbreakers/default-wordbreaker-esm.ts.html +1234 -0
  132. package/coverage/lcov-report/test/wordbreakers/index.html +131 -0
  133. package/coverage/lcov.info +5969 -0
  134. package/package.json +61 -0
  135. package/src/build-trie.ts +511 -0
  136. package/src/join-word-breaker-decorator.ts +134 -0
  137. package/src/lexical-model-compiler.ts +179 -0
  138. package/src/lexical-model.ts +150 -0
  139. package/src/main.ts +62 -0
  140. package/src/model-compiler-errors.ts +203 -0
  141. package/src/model-defaults.ts +111 -0
  142. package/src/model-definitions.ts +222 -0
  143. package/src/script-overrides-decorator.ts +75 -0
  144. package/test/README.md +15 -0
  145. package/test/fixtures/example.qaa.joinwordbreaker/example.qaa.joinwordbreaker.model.ts +10 -0
  146. package/test/fixtures/example.qaa.joinwordbreaker/wordlist.tsv +3 -0
  147. package/test/fixtures/example.qaa.scriptusesspaces/example.qaa.scriptusesspaces.model.ts +10 -0
  148. package/test/fixtures/example.qaa.scriptusesspaces/wordlist.tsv +8 -0
  149. package/test/fixtures/example.qaa.sencoten/example.qaa.sencoten.model.kmp.json +45 -0
  150. package/test/fixtures/example.qaa.sencoten/example.qaa.sencoten.model.kps +35 -0
  151. package/test/fixtures/example.qaa.sencoten/example.qaa.sencoten.model.ts +6 -0
  152. package/test/fixtures/example.qaa.sencoten/wordlist.tsv +10 -0
  153. package/test/fixtures/example.qaa.smp/example.qaa.smp.model.ts +6 -0
  154. package/test/fixtures/example.qaa.smp/wordlist.tsv +5 -0
  155. package/test/fixtures/example.qaa.trivial/example.qaa.trivial.model.ts +5 -0
  156. package/test/fixtures/example.qaa.trivial/wordlist.tsv +3 -0
  157. package/test/fixtures/example.qaa.utf16be/example.qaa.utf16be.model.ts +5 -0
  158. package/test/fixtures/example.qaa.utf16be/wordlist.txt +0 -0
  159. package/test/fixtures/example.qaa.utf16le/example.qaa.utf16le.model.ts +5 -0
  160. package/test/fixtures/example.qaa.utf16le/wordlist.txt +0 -0
  161. package/test/fixtures/example.qaa.wordbreaker/example.qaa.wordbreaker.model.ts +9 -0
  162. package/test/fixtures/example.qaa.wordbreaker/wordlist.tsv +3 -0
  163. package/test/helpers/index.ts +187 -0
  164. package/test/test-compile-model-with-pseudoclosure.ts +239 -0
  165. package/test/test-compile-model.ts +34 -0
  166. package/test/test-compile-trie.ts +152 -0
  167. package/test/test-default-apply-case.ts +128 -0
  168. package/test/test-default-search-term-to-key.ts +181 -0
  169. package/test/test-error-logger.ts +38 -0
  170. package/test/test-join-word-breaker.ts +97 -0
  171. package/test/test-model-definitions.ts +198 -0
  172. package/test/test-override-script-defaults.ts +33 -0
  173. package/test/test-parse-wordlist.ts +127 -0
  174. package/test/test-punctuation.ts +35 -0
  175. package/test/tsconfig.json +22 -0
  176. package/test/wordbreakers/README.md +3 -0
  177. package/test/wordbreakers/data.ts +1776 -0
  178. package/test/wordbreakers/default-wordbreaker-esm.ts +383 -0
  179. package/tools/create-override-script-regexp.ts +145 -0
  180. package/tsconfig.json +17 -0
@@ -0,0 +1,128 @@
1
+
2
+ import 'mocha';
3
+ import { assert } from 'chai';
4
+ import { defaultApplyCasing } from '../src/model-defaults.js';
5
+
6
+ describe('The default applyCasing() function', function () {
7
+ // // --------
8
+ // // Definitions toward SMP testing.
9
+ // // Ref: https://unicode.org/charts/nameslist/n_1D400.html
10
+ // // Useful for tests related to strings with supplementary pairs.
11
+ // let u = function(code: number): string {
12
+ // var H = Math.floor((code - 0x10000) / 0x400) + 0xD800;
13
+ // var L = (code - 0x10000) % 0x400 + 0xDC00;
14
+
15
+ // return String.fromCharCode(H, L);
16
+ // }
17
+
18
+ // Are there any known default-handled SMP cases?
19
+ // If not... we could make the defaultApplyCasing function simpler by not worrying about SMP.
20
+ //
21
+ // let smp_a = u(0x1d5ba); // MATHEMATICAL SANS-SERIF SMALL A
22
+ // let smp_p = u(0x1d5c9);
23
+ // let smp_l = u(0x1d5c5);
24
+ // let smp_e = u(0x1d5be);
25
+
26
+ // let smp_A = u(0x1d5a0); // MATHEMATICAL SANS-SERIF CAPITAL A
27
+ // let smp_P = u(0x1d5af);
28
+ // let smp_L = u(0x1d5ab);
29
+ // let smp_E = u(0x1d5a4);
30
+
31
+ // // Unfortunately... the default JS .toUpperCase() implementation doesn't actually
32
+ // // map the 'SMALL' versions to the 'CAPITAL' versions.
33
+ // // ---------
34
+
35
+ describe('case: \'lower\'', function() {
36
+ const testCases: [string, string][] = [
37
+ // Note: not written the Turkish way. Turns out 'İ'.toLowerCase() decomposes the result,
38
+ // which would have made for a fairly yucky test.
39
+ ['Istanbul', 'istanbul'],
40
+
41
+ // The DEFAULT function is NOT responsible for understanding the Turkish
42
+ // case regarding the lowercasing of:
43
+ // 'I' U+0049 LATIN CAPITAL LETTER I to 'ı' U+0131 LATIN SMALL LETTER DOTLESS I
44
+ // For Turkic languages, the recommendation is to make a
45
+ // custom applyCasing function:
46
+ ['DİYARBAKIR', 'di̇yarbakir'], // The 'i̇' is the decomposed result alluded to for the previous case.
47
+
48
+ // full-width romaji has corresponding capitalized versions:
49
+ ['AESTHETIC', 'aesthetic'],
50
+
51
+ // "skýlos" is Greek for dog 🇬🇷🐶
52
+ // starts with an 's' and ends with an 's'
53
+ // which are DIFFERENT CHARACTERS in lowercased Greek!
54
+ ['ΣΚΥΛΟΣ', 'σκυλος'],
55
+
56
+ // Uncased syntax and numbers should pass through unscathed:
57
+ ['1234.?!', '1234.?!']
58
+ ];
59
+
60
+ for (let [input, expected] of testCases) {
61
+ it(`should lowercase '${input}' as '${expected}'`, function() {
62
+ assert.equal(defaultApplyCasing('lower', input), expected);
63
+ });
64
+ }
65
+ });
66
+
67
+ describe('case: \'upper\'', function() {
68
+ const testCases: [string, string][] = [
69
+ // Inverse of the corresponding 'lower' test.
70
+ ['istanbul', 'ISTANBUL'],
71
+
72
+ // The DEFAULT function is NOT responsible for understanding the Turkish
73
+ // case regarding the uppercasing of:
74
+ // 'ı' U+0131 LATIN SMALL LETTER DOTLESS I to 'I' U+0049 LATIN CAPITAL LETTER I
75
+ // For Turkic languages, the recommendation is to make a
76
+ // custom applyCasing function:
77
+ ['diyarbakır', 'DIYARBAKIR'], // The dotless 'ı' uppercases to a plain 'I', so nothing is decomposed in this direction.
78
+
79
+ // full-width romaji has corresponding capitalized versions:
80
+ ['aesthetic', 'AESTHETIC'],
81
+
82
+ // "skýlos" is Greek for dog 🇬🇷🐶
83
+ // starts with an 's' and ends with an 's'
84
+ // which are DIFFERENT CHARACTERS in lowercased Greek!
85
+ ['σκυλος', 'ΣΚΥΛΟΣ'],
86
+
87
+ // Uncased syntax and numbers should pass through unscathed:
88
+ ['1234.?!', '1234.?!']
89
+ ];
90
+
91
+ for (let [input, expected] of testCases) {
92
+ it(`should uppercase '${input}' as '${expected}'`, function() {
93
+ assert.equal(defaultApplyCasing('upper', input), expected);
94
+ });
95
+ }
96
+ });
97
+
98
+ describe('case: \'initial\'', function() {
99
+ const testCases: [string, string][] = [
100
+ // Inverse of the corresponding 'lower' test.
101
+ ['istanbul', 'Istanbul'],
102
+
103
+ // The DEFAULT function is NOT responsible for understanding the Turkish
104
+ // case regarding the uppercasing of:
105
+ // 'ı' U+0131 LATIN SMALL LETTER DOTLESS I to 'I' U+0049 LATIN CAPITAL LETTER I
106
+ // For Turkic languages, the recommendation is to make a
107
+ // custom applyCasing function:
108
+ ['diyarbakır', 'Diyarbakır'], // Only the initial letter is cased; the dotless 'ı' passes through unchanged.
109
+
110
+ // full-width romaji has corresponding capitalized versions:
111
+ ['aesthetic', 'Aesthetic'],
112
+
113
+ // "skýlos" is Greek for dog 🇬🇷🐶
114
+ // starts with an 's' and ends with an 's'
115
+ // which are DIFFERENT CHARACTERS in lowercased Greek!
116
+ ['σκυλος', 'Σκυλος'],
117
+
118
+ // Uncased syntax and numbers should pass through unscathed:
119
+ ['1234.?!', '1234.?!']
120
+ ];
121
+
122
+ for (let [input, expected] of testCases) {
123
+ it(`should initial-case '${input}' as '${expected}'`, function() {
124
+ assert.equal(defaultApplyCasing('initial', input), expected);
125
+ });
126
+ }
127
+ });
128
+ });
@@ -0,0 +1,181 @@
1
+
2
+ import 'mocha';
3
+ import {assert} from 'chai';
4
+ import { defaultSearchTermToKey,
5
+ defaultCasedSearchTermToKey,
6
+ defaultApplyCasing } from '../src/model-defaults.js';
7
+
8
+
9
+ describe('The default searchTermToKey() function', function () {
10
+ describe('languageUsesCasing: false', function() {
11
+ const testCases: [string, string][] = [
12
+ // "İstanbul" has a U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE.
13
+ // Without any casing operations, only the I should be altered.
14
+ ['İstanbul', 'Istanbul'],
15
+
16
+ // Similarly...
17
+ ['DİYARBAKIR', 'DIYARBAKIR'],
18
+
19
+ // "skýlos" is Greek for dog 🇬🇷🐶
20
+ // starts with an 's' and ends with an 's'
21
+ // which are DIFFERENT CHARACTERS in lowercased Greek!
22
+ ['σκύλος', 'σκυλος'],
23
+ ['ΣΚΥΛΟΣ', 'ΣΚΥΛΟΣ'],
24
+
25
+ // full-width romaji is compatibility-equivalent to ASCII characters:
26
+ ['aesthetic', 'aesthetic'],
27
+
28
+ // U+212B ANGSTROM SIGN (Å)
29
+ // U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE (Å)
30
+ // and should both normalize to 'A'
31
+ ['\u212B', 'A'],
32
+ ['\u00C5', 'A'],
33
+
34
+ // We should not fall for U+037E GREEK QUESTION MARK's trolling:
35
+ ['\u037e', ';'],
36
+
37
+ // Test presentational forms of Arabic:
38
+ // U+FE8D ARABIC LETTER ALEF ISOLATED FORM -> U+0627 ARABIC LETTER ALEF
39
+ // U+FEDF ARABIC LETTER LAM INITIAL FORM -> U+0644 ARABIC LETTER LAM
40
+ // U+FED8 ARABIC LETTER QAF MEDIAL FORM -> U+0642 ARABIC LETTER QAF
41
+ // U+FEEC ARABIC LETTER HEH MEDIAL FORM -> U+0647 ARABIC LETTER HEH
42
+ // U+FEEE ARABIC LETTER WAW FINAL FORM -> U+0648 ARABIC LETTER WAW
43
+ // U+FE93 ARABIC LETTER TEH MARBUTA ISOLATED FORM -> U+0629 ARABIC LETTER TEH MARBUTA
44
+ ['\uFE8D\uFEDF\uFED8\uFEEC\uFEEE\uFE93', '\u0627\u0644\u0642\u0647\u0648\u0629'],
45
+
46
+ // Combine both NFKD **AND** knocking off diacritics:
47
+ // U+01C4 LATIN CAPITAL LETTER DZ WITH CARON (DŽ) -> <U+0044, U+005A> (DZ)
48
+ ['DŽ', 'DZ'],
49
+ ];
50
+
51
+ for (let [input, expected] of testCases) {
52
+ it(`should normalize '${input}' to '${expected}'`, function() {
53
+ assert.equal(defaultSearchTermToKey(input), expected);
54
+ });
55
+ }
56
+ });
57
+
58
+ describe('languageUsesCasing: true (custom applyCasing, inverts lower- & upper- casing)', function() {
59
+ const testCases: [string, string][] = [
60
+ // The DEFAULT function is NOT responsible for understanding the Turkish
61
+ // distinction between U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and
62
+ // 'I' U+0049 LATIN CAPITAL LETTER I.
63
+ // For Turkic languages, the recommendation is thus to make a
64
+ // custom searchTermToKey function.
65
+ ['İstanbul', 'ISTANBUL'],
66
+ ['DİYARBAKIR', 'DIYARBAKIR'],
67
+
68
+ // "skýlos" is Greek for dog 🇬🇷🐶
69
+ // starts with an 's' and ends with an 's'
70
+ // which are DIFFERENT CHARACTERS in lowercased Greek!
71
+ ['σκύλος', 'ΣΚΥΛΟΣ'],
72
+ ['σκυλοσ', 'ΣΚΥΛΟΣ'],
73
+
74
+ // full-width romaji is compatibility-equivalent to ASCII characters:
75
+ ['aesthetic', 'AESTHETIC'],
76
+
77
+ // U+212B ANGSTROM SIGN (Å)
78
+ // U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE (Å)
79
+ // and should both normalize to 'A' (this suite's custom casing uppercases when asked to lowercase)
80
+ ['\u212B', 'A'],
81
+ ['\u00C5', 'A'],
82
+
83
+ // We should not fall for U+037E GREEK QUESTION MARK's trolling:
84
+ ['\u037e', ';'],
85
+
86
+ // Test presentational forms of Arabic:
87
+ // U+FE8D ARABIC LETTER ALEF ISOLATED FORM -> U+0627 ARABIC LETTER ALEF
88
+ // U+FEDF ARABIC LETTER LAM INITIAL FORM -> U+0644 ARABIC LETTER LAM
89
+ // U+FED8 ARABIC LETTER QAF MEDIAL FORM -> U+0642 ARABIC LETTER QAF
90
+ // U+FEEC ARABIC LETTER HEH MEDIAL FORM -> U+0647 ARABIC LETTER HEH
91
+ // U+FEEE ARABIC LETTER WAW FINAL FORM -> U+0648 ARABIC LETTER WAW
92
+ // U+FE93 ARABIC LETTER TEH MARBUTA ISOLATED FORM -> U+0629 ARABIC LETTER TEH MARBUTA
93
+ ['\uFE8D\uFEDF\uFED8\uFEEC\uFEEE\uFE93', '\u0627\u0644\u0642\u0647\u0648\u0629'],
94
+
95
+ // Combine both NFKD **AND** knocking off diacritics:
96
+ // U+01C4 LATIN CAPITAL LETTER DZ WITH CARON (DŽ) -> <U+0044, U+005A> (DZ)
97
+ ['DŽ', 'DZ'],
98
+ ];
99
+
100
+ // While a Turkish-based test would be nice, Turkish needs custom keying,
101
+ // as U+0130's default handling is... not ideal in Turkish.
102
+ //
103
+ // Instead, we can get a simple-enough test with inverted casing.
104
+ let customCasing = function(caseToApply: CasingForm,
105
+ text: string,
106
+ defaultApplyCasing: CasingFunction): string {
107
+ switch(caseToApply) {
108
+ case 'lower':
109
+ return text.toUpperCase();
110
+ case 'upper':
111
+ return text.toLowerCase();
112
+ case 'initial':
113
+ return customCasing('upper', text.charAt(0), defaultApplyCasing) + text.substr(1);
114
+ default:
115
+ return text;
116
+ }
117
+ }
118
+
119
+ let customCasingClosure = function(caseToApply: CasingForm, text: string): string {
120
+ return customCasing(caseToApply, text, defaultApplyCasing);
121
+ }
122
+
123
+ for (let [input, expected] of testCases) {
124
+ it(`should normalize '${input}' to '${expected}'`, function() {
125
+ assert.equal(defaultCasedSearchTermToKey(input, customCasingClosure as CasingFunction), expected);
126
+ });
127
+ }
128
+ });
129
+
130
+ describe('languageUsesCasing: undefined (legacy 12.0 / 13.0 mode, uses `defaultApplyCasing`)', function() {
131
+ const testCases: [string, string][] = [
132
+ // "İstanbul" has a U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE.
133
+ // This should be lowercased.
134
+ ['İstanbul', 'istanbul'],
135
+
136
+ // The DEFAULT function is NOT responsible for understanding the Turkish
137
+ // case regarding the lowercasing of:
138
+ // 'I' U+0049 LATIN CAPITAL LETTER I to 'ı' U+0131 LATIN SMALL LETTER DOTLESS I
139
+ // For Turkic languages, the recommendation is to make a
140
+ // custom searchTermToKey function:
141
+ ['DİYARBAKIR', 'diyarbakir'],
142
+
143
+ // "skýlos" is Greek for dog 🇬🇷🐶
144
+ // starts with an 's' and ends with an 's'
145
+ // which are DIFFERENT CHARACTERS in lowercased Greek!
146
+ ['σκύλος', 'σκυλος'],
147
+ ['ΣΚΥΛΟΣ', 'σκυλοσ'],
148
+
149
+ // full-width romaji is compatibility-equivalent to ASCII characters:
150
+ ['aesthetic', 'aesthetic'],
151
+
152
+ // U+212B ANGSTROM SIGN (Å)
153
+ // U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE (Å)
154
+ // and should both normalize to 'a'
155
+ ['\u212B', 'a'],
156
+ ['\u00C5', 'a'],
157
+
158
+ // We should not fall for U+037E GREEK QUESTION MARK's trolling:
159
+ ['\u037e', ';'],
160
+
161
+ // Test presentational forms of Arabic:
162
+ // U+FE8D ARABIC LETTER ALEF ISOLATED FORM -> U+0627 ARABIC LETTER ALEF
163
+ // U+FEDF ARABIC LETTER LAM INITIAL FORM -> U+0644 ARABIC LETTER LAM
164
+ // U+FED8 ARABIC LETTER QAF MEDIAL FORM -> U+0642 ARABIC LETTER QAF
165
+ // U+FEEC ARABIC LETTER HEH MEDIAL FORM -> U+0647 ARABIC LETTER HEH
166
+ // U+FEEE ARABIC LETTER WAW FINAL FORM -> U+0648 ARABIC LETTER WAW
167
+ // U+FE93 ARABIC LETTER TEH MARBUTA ISOLATED FORM -> U+0629 ARABIC LETTER TEH MARBUTA
168
+ ['\uFE8D\uFEDF\uFED8\uFEEC\uFEEE\uFE93', '\u0627\u0644\u0642\u0647\u0648\u0629'],
169
+
170
+ // Combine both NFKD **AND** knocking off diacritics:
171
+ // U+01C4 LATIN CAPITAL LETTER DZ WITH CARON (DŽ) -> <U+0064, U+007A> (dz)
172
+ ['DŽ', 'dz'],
173
+ ];
174
+
175
+ for (let [input, expected] of testCases) {
176
+ it(`should normalize '${input}' to '${expected}'`, function() {
177
+ assert.equal(defaultCasedSearchTermToKey(input, defaultApplyCasing), expected);
178
+ });
179
+ }
180
+ });
181
+ });
@@ -0,0 +1,38 @@
1
+ import { MAX_MESSAGES, KeymanCompilerError, log } from "../src/model-compiler-errors.js";
2
+ import { LogHoarder } from "./helpers/index.js";
3
+ import { assert } from "chai";
4
+
5
+ describe('error logger', function () {
6
+ beforeEach(function () {
7
+ this.logHoarder = (new LogHoarder).install()
8
+ })
9
+
10
+ afterEach(function () {
11
+ this.logHoarder.uninstall();
12
+ delete this.logHoarder;
13
+ })
14
+
15
+ it('should stop logging messages **after** a maximum', function () {
16
+ for (let i = 0; i < MAX_MESSAGES; i++) {
17
+ log(KeymanCompilerError.CWARN_DuplicateWordInSameFile, "fake error");
18
+ }
19
+
20
+ // We've logged *just* enough messages. This error should not be found:
21
+ assert.isFalse(this.logHoarder.hasSeenCode(
22
+ KeymanCompilerError.CWARN_TooManyErrorsOrWarnings
23
+ ));
24
+
25
+ // Log just one too many:
26
+ log(KeymanCompilerError.CWARN_DuplicateWordInSameFile, "fake error");
27
+
28
+ assert.isTrue(this.logHoarder.hasSeenCode(
29
+ KeymanCompilerError.CWARN_TooManyErrorsOrWarnings
30
+ ));
31
+
32
+ // Log a DIFFERENT error -- it should not appear in the log
33
+ log(KeymanCompilerError.CWARN_MixedNormalizationForms, "fake error");
34
+ assert.isFalse(this.logHoarder.hasSeenCode(
35
+ KeymanCompilerError.CWARN_MixedNormalizationForms
36
+ ));
37
+ })
38
+ })
@@ -0,0 +1,97 @@
1
+ import { assert } from "chai";
2
+ import defaultWordBreaker from './wordbreakers/default-wordbreaker-esm.js';
3
+ import {decorateWithJoin} from '../src/join-word-breaker-decorator.js';
4
+
5
+ describe('The join word breaker decorator', function () {
6
+ it('should decorate an existing word breaker', function () {
7
+ let breakWords = decorateWithJoin(defaultWordBreaker, ['-']);
8
+ assert.isFunction(breakWords);
9
+ });
10
+
11
+ const TEST_CASES: [string, string[], string[], string[]][] = [
12
+ /* input, joiners, default breaks, breaks with joins */
13
+ // Original test case from https://github.com/keymanapp/keyman/issues/2753
14
+ ['khui-chhùi', ['-'], ["khui", "-", "chhùi"], ["khui-chhùi"]],
15
+
16
+ // Plains Cree SRO:
17
+ ['ê-kotiskâwêyâhk', ['-'], ['ê', '-', 'kotiskâwêyâhk'], ['ê-kotiskâwêyâhk']],
18
+
19
+ // Edge cases:
20
+
21
+ // Joiner alone:
22
+ ['-', ['-'], ['-'], ['-']],
23
+ // Joiner at the end:
24
+ ['ni-', ['-'], ['ni', '-'], ['ni-']],
25
+ // Joiner at the start:
26
+ ['-ân', ['-'], ['-', 'ân'], ['-ân']],
27
+
28
+ // This was my guiding test case:
29
+ [
30
+ "-yâhk ê-nitawi-kotiskâwêyâhk ni-",
31
+ ["-"],
32
+ ["-", "yâhk", "ê", "-", "nitawi", "-", "kotiskâwêyâhk", "ni", "-"],
33
+ ["-yâhk", "ê-nitawi-kotiskâwêyâhk", "ni-"]
34
+ ],
35
+
36
+ // Do not perform any joins:
37
+ ["hello world", ["-"], ["hello", "world"], ["hello", "world"]],
38
+
39
+ // Joining using multiple joiners
40
+ [
41
+ "Email: no-body@example.com",
42
+ ["@", "-"],
43
+ ["Email", ":", "no", "-", "body", "@", "example.com"],
44
+ ["Email", ":", "no-body@example.com"]
45
+ ],
46
+
47
+ // Joining with two or more joiners in a row
48
+ [
49
+ "nobody@@example.com",
50
+ ["@"],
51
+ ["nobody", "@", "@", "example.com"],
52
+ ["nobody@@example.com"]
53
+ ],
54
+
55
+ // it should NOT join non-contiguous spans:
56
+ [
57
+ "this- is -bad",
58
+ ["-"],
59
+ ["this", "-", "is", "-", "bad"],
60
+ ["this-", "is", "-bad"]
61
+ ],
62
+
63
+ // different but adjacent joiners
64
+ [
65
+ "I made the kawé:-conjugator.",
66
+ ["-", ":"],
67
+ ["I", "made", "the", "kawé", ":", "-", "conjugator", "."],
68
+ ["I", "made", "the", "kawé:-conjugator", "."]
69
+ ],
70
+
71
+ // 3+ joiners in a row
72
+ [
73
+ // NB: – is U+2013 EN DASH
74
+ "This language is nut–=💠¤~ty!",
75
+ ["~", "–", "¤", "=", "💠"],
76
+ ["This", "language", "is", "nut", "–", "=", "💠", "¤", "~", "ty", "!"],
77
+ ["This", "language", "is", "nut–=💠¤~ty", "!"],
78
+ ],
79
+ ]
80
+
81
+ for (let [phrase, joiners, unjoined, expected] of TEST_CASES) {
82
+ it(`should break «${[phrase]}» as [${expected.join(' ;; ')}]`, function () {
83
+ let breakWords = decorateWithJoin(defaultWordBreaker, joiners);
84
+ let unjoinedResult = defaultWordBreaker(phrase).map(onlyText);
85
+ assert.deepEqual(unjoinedResult, unjoined);
86
+ let actualResult = breakWords(phrase).map(onlyText);
87
+ assert.deepEqual(actualResult, expected);
88
+ });
89
+ }
90
+
91
+ /**
92
+ * Get just the text from a span.
93
+ */
94
+ function onlyText(span: Span) {
95
+ return span.text;
96
+ }
97
+ });
@@ -0,0 +1,198 @@
1
+ import 'mocha';
2
+ import { assert } from 'chai';
3
+ import { ModelDefinitions } from '../src/model-definitions.js';
4
+ import { LexicalModelSource } from '../src/lexical-model.js';
5
+
6
+ describe('Model definition pseudoclosures', function () {
7
+ describe('14.0 defaults', function() {
8
+ describe('languageUsesCasing == true', function() {
9
+ // We don't need a complete spec for this, given the (currently) limited range of what
10
+ // the ModelPseudoclosure covers.
11
+ let modelSource: LexicalModelSource = {
12
+ languageUsesCasing: true,
13
+ sources: [],
14
+ format: 'trie-1.0'
15
+ };
16
+
17
+ let pseudoclosure = new ModelDefinitions(modelSource);
18
+
19
+ const testCases: [string, string, string][] = [
20
+ // Note: not written the Turkish way. Turns out 'İ'.toLowerCase() decomposes the result,
21
+ // which would have made for a fairly yucky test.
22
+ ['Istanbul', 'istanbul', 'istanbul'],
23
+
24
+ // The DEFAULT function is NOT responsible for understanding the Turkish
25
+ // case regarding the lowercasing of:
26
+ // 'I' U+0049 LATIN CAPITAL LETTER I to 'ı' U+0131 LATIN SMALL LETTER DOTLESS I
27
+ // For Turkic languages, the recommendation is to make a
28
+ // custom applyCasing function:
29
+ ['DİYARBAKIR', 'di̇yarbakir', 'diyarbakir'], // The 'i̇' is the decomposed result alluded to for the previous case.
30
+
31
+ // full-width romaji has corresponding lowercased versions:
32
+ ['AESTHETIC', 'aesthetic', 'aesthetic'],
33
+
34
+ // "skýlos" is Greek for dog 🇬🇷🐶
35
+ // starts with an 's' and ends with an 's'
36
+ // which are DIFFERENT CHARACTERS in lowercased Greek!
37
+ ['σκύλος', 'σκύλος', 'σκυλος'],
38
+ ['ΣΚΥΛΟΣ', 'σκυλος', 'σκυλοσ'], // the keyed version after lowercasing doesn't know how
39
+ // to make the distinction. Both 'Σ's have the same char-code.
40
+
41
+ // Uncased syntax and numbers should pass through unscathed:
42
+ ['1234.?!', '1234.?!', '1234.?!'],
43
+ ['”', '”', '"'],
44
+ ["‘", "‘", "'"]
45
+ ];
46
+
47
+ for (let [input, cased, keyed] of testCases) {
48
+ it(`should case '${input}' as '${cased}'`, function() {
49
+ assert.equal(pseudoclosure.applyCasing('lower', input), cased);
50
+ });
51
+
52
+ it(`should key '${input}' as '${keyed}'`, function() {
53
+ assert.equal(pseudoclosure.searchTermToKey(input), keyed);
54
+ });
55
+ }
56
+ });
57
+
58
+ describe('languageUsesCasing == false', function() {
59
+ // We don't need a complete spec for this, given the (currently) limited range of what
60
+ // the ModelPseudoclosure covers.
61
+ let modelSource: LexicalModelSource = {
62
+ languageUsesCasing: false,
63
+ sources: [],
64
+ format: 'trie-1.0'
65
+ };
66
+
67
+ let pseudoclosure = new ModelDefinitions(modelSource);
68
+
69
+ const testCases: [string, string][] = [
70
+ // Note: not written the Turkish way. Turns out 'İ'.toLowerCase() decomposes the result,
71
+ // which would have made for a fairly yucky test.
72
+ ['Istanbul', 'Istanbul'],
73
+
74
+ ['DİYARBAKIR', 'DIYARBAKIR'],
75
+
76
+ // full-width romaji has corresponding capitalized versions:
77
+ ['AESTHETIC', 'AESTHETIC'],
78
+
79
+ // "skýlos" is Greek for dog 🇬🇷🐶
80
+ // starts with an 's' and ends with an 's'
81
+ // which are DIFFERENT CHARACTERS in lowercased Greek!
82
+ ['σκύλος', 'σκυλος'],
83
+ ['ΣΚΥΛΟΣ', 'ΣΚΥΛΟΣ'], // the keyed version after lowercasing doesn't know how
84
+ // to make the distinction. Both 'Σ's have the same char-code.
85
+
86
+ // Uncased syntax and numbers should pass through unscathed:
87
+ ['1234.?!', '1234.?!'],
88
+
89
+ ['”', '"'],
90
+ ["‘", "'"]
91
+ ];
92
+
93
+ for (let [input, keyed] of testCases) {
94
+ it(`should key '${input}' as '${keyed}'`, function() {
95
+ assert.equal(pseudoclosure.searchTermToKey(input), keyed);
96
+ });
97
+ }
98
+ });
99
+ });
100
+ describe('Pre 14.0 defaults (languageUsesCasing == undefined)', function() {
101
+ // We don't need a complete spec for this, given the (currently) limited range of what
102
+ // the ModelPseudoclosure covers.
103
+ let modelSource: LexicalModelSource = {
104
+ sources: [],
105
+ format: 'trie-1.0'
106
+ };
107
+
108
+ let pseudoclosure = new ModelDefinitions(modelSource);
109
+
110
+ const testCases: [string, string][] = [
111
+ // Note: not written the Turkish way. Turns out 'İ'.toLowerCase() decomposes the result,
112
+ // which would have made for a fairly yucky test.
113
+ ['Istanbul', 'istanbul'],
114
+
115
+ ['DİYARBAKIR', 'diyarbakir'],
116
+
117
+ // full-width romaji has corresponding lowercased versions:
118
+ ['AESTHETIC', 'aesthetic'],
119
+
120
+ // "skýlos" is Greek for dog 🇬🇷🐶
121
+ // starts with an 's' and ends with an 's'
122
+ // which are DIFFERENT CHARACTERS in lowercased Greek!
123
+ ['σκύλος', 'σκυλος'],
124
+ ['ΣΚΥΛΟΣ', 'σκυλοσ'], // the keyed version after lowercasing doesn't know how
125
+ // to make the distinction. Both 'Σ's have the same char-code.
126
+
127
+ // Uncased syntax and numbers should pass through unscathed:
128
+ ['1234.?!', '1234.?!']
129
+ ];
130
+
131
+ for (let [input, keyed] of testCases) {
132
+ it(`should key '${input}' as '${keyed}'`, function() {
133
+ assert.equal(pseudoclosure.searchTermToKey(input), keyed);
134
+ });
135
+ }
136
+ });
137
+
138
+ describe('Model-defined applyCasing + (dependent) searchTermToKey', function() {
139
+ // Note: this test only implements enough Turkish-related stuff to facilitate
140
+ // a functional test. Not guaranteed to be sufficient for actual Turkish use.
141
+ let turkishCasing = function(form: CasingForm, text: string, defaultApplyCasing: (form: CasingForm, text: string) => string): string {
142
+ switch(form) {
143
+ case 'lower':
144
+ return defaultApplyCasing(form, text
145
+ .replace(/I/g, 'ı')
146
+ .replace(/İ/g, 'i'));
147
+ case 'upper':
148
+ return defaultApplyCasing(form, text
149
+ .replace(/ı/g, 'I')
150
+ .replace(/i/g, 'İ'));
151
+ case 'initial':
152
+ return turkishCasing('upper', text.charAt(0), defaultApplyCasing) + text.substr(1);
153
+ default:
154
+ return text;
155
+ }
156
+ }
157
+
158
+ let modelSource: LexicalModelSource = {
159
+ languageUsesCasing: true,
160
+ applyCasing: turkishCasing,
161
+ searchTermToKey: function(wordform: string, applyCasing: CasingFunction): string {
162
+ return Array.from(wordform
163
+ .normalize('NFC') // Mostly to avoid decomposing 'İ'
164
+ ) // end of `Array.from`
165
+ .map(function(c) { return applyCasing('lower', c)}) // Will use custom `applyCasing` definition!
166
+ .join('');
167
+ },
168
+ sources: [],
169
+ format: 'trie-1.0'
170
+ };
171
+
172
+ let pseudoclosure = new ModelDefinitions(modelSource);
173
+
174
+ const testCases: [string, string, string][] = [
175
+ ['İstanbul', 'istanbul', 'istanbul'],
176
+
177
+ // The DEFAULT function is NOT responsible for understanding the Turkish
178
+ // case regarding the lowercasing of:
179
+ // 'I' U+0049 LATIN CAPITAL LETTER I to 'ı' U+0131 LATIN SMALL LETTER DOTLESS I
180
+ // For Turkic languages, the recommendation is to make a
181
+ // custom applyCasing function:
182
+ ['DİYARBAKIR', 'diyarbakır', 'diyarbakır'],
183
+
184
+ // Uncased syntax and numbers should pass through unscathed:
185
+ ['1234.?!', '1234.?!', '1234.?!']
186
+ ];
187
+
188
+ for (let [input, cased, keyed] of testCases) {
189
+ it(`should case '${input}' as '${cased}'`, function() {
190
+ assert.equal(pseudoclosure.applyCasing('lower', input), cased);
191
+ });
192
+
193
+ it(`should key '${input}' as '${keyed}'`, function() {
194
+ assert.equal(pseudoclosure.searchTermToKey(input), keyed);
195
+ });
196
+ }
197
+ });
198
+ });
@@ -0,0 +1,33 @@
1
+ import { assert } from "chai";
2
+ import defaultWordBreaker from './wordbreakers/default-wordbreaker-esm.js';
3
+ import {decorateWithScriptOverrides} from '../src/script-overrides-decorator.js';
4
+
5
+ const THIN_SPACE = "\u2009";
6
+
7
+ describe('The script overrides word breaker decorator', function () {
8
+ it('should decorate an existing word breaker', function () {
9
+ let breakWords = decorateWithScriptOverrides(defaultWordBreaker, 'break-words-at-spaces')
10
+ assert.isFunction(breakWords);
11
+ });
12
+
13
+ // I do not read this script or language so I have NO idea what this says
14
+ // ¯\_(ツ)_/¯
15
+ const phraseComponents = ["ຈາກ", THIN_SPACE, "ກ໌ນິ", THIN_SPACE, "ສນາ", THIN_SPACE, "ເກ໌າະ", THIN_SPACE, "ຢັອຫ", THIN_SPACE, "ລະ", THIN_SPACE, "ບຣອມ", THIN_SPACE, "ເຢາະ", ","];
16
+ const phraseSpans = phraseComponents.filter(span => span !== THIN_SPACE);
17
+ const phrase = phraseComponents.join("");
18
+ const expectedNumSpans = phraseSpans.length;
19
+
20
+ it(`should break «${[phrase]}» as ${expectedNumSpans} spans`, function () {
21
+ let breakWords = decorateWithScriptOverrides(defaultWordBreaker, 'break-words-at-spaces');
22
+ let defaultResult = defaultWordBreaker(phrase);
23
+
24
+ assert.isAbove(defaultResult.length, expectedNumSpans);
25
+ let actualResult = breakWords(phrase);
26
+ assert.lengthOf(actualResult, expectedNumSpans);
27
+ assert.deepEqual(actualResult.map(grabText), phraseSpans);
28
+ });
29
+
30
+ function grabText(span: Span) {
31
+ return span.text;
32
+ }
33
+ });