@keymanapp/kmc-model 17.0.85-alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. package/.nyc_output/coverage-10524-1681239236645-0.json +1 -0
  2. package/Makefile +38 -0
  3. package/build/cjs-src/lexical-model-compiler.cjs +152688 -0
  4. package/build/src/build-trie.d.ts +40 -0
  5. package/build/src/build-trie.d.ts.map +1 -0
  6. package/build/src/build-trie.js +362 -0
  7. package/build/src/build-trie.js.map +1 -0
  8. package/build/src/join-word-breaker-decorator.d.ts +10 -0
  9. package/build/src/join-word-breaker-decorator.d.ts.map +1 -0
  10. package/build/src/join-word-breaker-decorator.js +121 -0
  11. package/build/src/join-word-breaker-decorator.js.map +1 -0
  12. package/build/src/lexical-model-compiler.d.ts +19 -0
  13. package/build/src/lexical-model-compiler.d.ts.map +1 -0
  14. package/build/src/lexical-model-compiler.js +155 -0
  15. package/build/src/lexical-model-compiler.js.map +1 -0
  16. package/build/src/lexical-model.d.ts +135 -0
  17. package/build/src/lexical-model.d.ts.map +1 -0
  18. package/build/src/lexical-model.js +6 -0
  19. package/build/src/lexical-model.js.map +1 -0
  20. package/build/src/main.d.ts +15 -0
  21. package/build/src/main.d.ts.map +1 -0
  22. package/build/src/main.js +46 -0
  23. package/build/src/main.js.map +1 -0
  24. package/build/src/model-compiler-errors.d.ts +77 -0
  25. package/build/src/model-compiler-errors.d.ts.map +1 -0
  26. package/build/src/model-compiler-errors.js +156 -0
  27. package/build/src/model-compiler-errors.js.map +1 -0
  28. package/build/src/model-defaults.d.ts +56 -0
  29. package/build/src/model-defaults.d.ts.map +1 -0
  30. package/build/src/model-defaults.js +106 -0
  31. package/build/src/model-defaults.js.map +1 -0
  32. package/build/src/model-definitions.d.ts +71 -0
  33. package/build/src/model-definitions.d.ts.map +1 -0
  34. package/build/src/model-definitions.js +189 -0
  35. package/build/src/model-definitions.js.map +1 -0
  36. package/build/src/script-overrides-decorator.d.ts +4 -0
  37. package/build/src/script-overrides-decorator.d.ts.map +1 -0
  38. package/build/src/script-overrides-decorator.js +63 -0
  39. package/build/src/script-overrides-decorator.js.map +1 -0
  40. package/build/test/helpers/index.d.ts +69 -0
  41. package/build/test/helpers/index.d.ts.map +1 -0
  42. package/build/test/helpers/index.js +160 -0
  43. package/build/test/helpers/index.js.map +1 -0
  44. package/build/test/test-compile-model-with-pseudoclosure.d.ts +2 -0
  45. package/build/test/test-compile-model-with-pseudoclosure.d.ts.map +1 -0
  46. package/build/test/test-compile-model-with-pseudoclosure.js +200 -0
  47. package/build/test/test-compile-model-with-pseudoclosure.js.map +1 -0
  48. package/build/test/test-compile-model.d.ts +2 -0
  49. package/build/test/test-compile-model.d.ts.map +1 -0
  50. package/build/test/test-compile-model.js +30 -0
  51. package/build/test/test-compile-model.js.map +1 -0
  52. package/build/test/test-compile-trie.d.ts +2 -0
  53. package/build/test/test-compile-trie.d.ts.map +1 -0
  54. package/build/test/test-compile-trie.js +125 -0
  55. package/build/test/test-compile-trie.js.map +1 -0
  56. package/build/test/test-default-apply-case.d.ts +2 -0
  57. package/build/test/test-default-apply-case.d.ts.map +1 -0
  58. package/build/test/test-default-apply-case.js +105 -0
  59. package/build/test/test-default-apply-case.js.map +1 -0
  60. package/build/test/test-default-search-term-to-key.d.ts +2 -0
  61. package/build/test/test-default-search-term-to-key.d.ts.map +1 -0
  62. package/build/test/test-default-search-term-to-key.js +148 -0
  63. package/build/test/test-default-search-term-to-key.js.map +1 -0
  64. package/build/test/test-error-logger.d.ts +2 -0
  65. package/build/test/test-error-logger.d.ts.map +1 -0
  66. package/build/test/test-error-logger.js +26 -0
  67. package/build/test/test-error-logger.js.map +1 -0
  68. package/build/test/test-join-word-breaker.d.ts +2 -0
  69. package/build/test/test-join-word-breaker.d.ts.map +1 -0
  70. package/build/test/test-join-word-breaker.js +84 -0
  71. package/build/test/test-join-word-breaker.js.map +1 -0
  72. package/build/test/test-model-definitions.d.ts +2 -0
  73. package/build/test/test-model-definitions.d.ts.map +1 -0
  74. package/build/test/test-model-definitions.js +165 -0
  75. package/build/test/test-model-definitions.js.map +1 -0
  76. package/build/test/test-override-script-defaults.d.ts +2 -0
  77. package/build/test/test-override-script-defaults.d.ts.map +1 -0
  78. package/build/test/test-override-script-defaults.js +28 -0
  79. package/build/test/test-override-script-defaults.js.map +1 -0
  80. package/build/test/test-parse-wordlist.d.ts +2 -0
  81. package/build/test/test-parse-wordlist.d.ts.map +1 -0
  82. package/build/test/test-parse-wordlist.js +110 -0
  83. package/build/test/test-parse-wordlist.js.map +1 -0
  84. package/build/test/test-punctuation.d.ts +2 -0
  85. package/build/test/test-punctuation.d.ts.map +1 -0
  86. package/build/test/test-punctuation.js +31 -0
  87. package/build/test/test-punctuation.js.map +1 -0
  88. package/build/test/tsconfig.tsbuildinfo +1 -0
  89. package/build/test/wordbreakers/data.d.ts +35 -0
  90. package/build/test/wordbreakers/data.d.ts.map +1 -0
  91. package/build/test/wordbreakers/data.js +1778 -0
  92. package/build/test/wordbreakers/data.js.map +1 -0
  93. package/build/test/wordbreakers/default-wordbreaker-esm.d.ts +10 -0
  94. package/build/test/wordbreakers/default-wordbreaker-esm.d.ts.map +1 -0
  95. package/build/test/wordbreakers/default-wordbreaker-esm.js +354 -0
  96. package/build/test/wordbreakers/default-wordbreaker-esm.js.map +1 -0
  97. package/build/tsconfig.tsbuildinfo +1 -0
  98. package/build.sh +73 -0
  99. package/coverage/lcov-report/base.css +224 -0
  100. package/coverage/lcov-report/block-navigation.js +87 -0
  101. package/coverage/lcov-report/favicon.png +0 -0
  102. package/coverage/lcov-report/index.html +161 -0
  103. package/coverage/lcov-report/prettify.css +1 -0
  104. package/coverage/lcov-report/prettify.js +2 -0
  105. package/coverage/lcov-report/sort-arrow-sprite.png +0 -0
  106. package/coverage/lcov-report/sorter.js +196 -0
  107. package/coverage/lcov-report/src/build-trie.ts.html +1618 -0
  108. package/coverage/lcov-report/src/index.html +221 -0
  109. package/coverage/lcov-report/src/join-word-breaker-decorator.ts.html +487 -0
  110. package/coverage/lcov-report/src/lexical-model-compiler.ts.html +622 -0
  111. package/coverage/lcov-report/src/main.ts.html +271 -0
  112. package/coverage/lcov-report/src/model-compiler-errors.ts.html +691 -0
  113. package/coverage/lcov-report/src/model-defaults.ts.html +415 -0
  114. package/coverage/lcov-report/src/model-definitions.ts.html +748 -0
  115. package/coverage/lcov-report/src/script-overrides-decorator.ts.html +310 -0
  116. package/coverage/lcov-report/test/helpers/index.html +116 -0
  117. package/coverage/lcov-report/test/helpers/index.ts.html +646 -0
  118. package/coverage/lcov-report/test/index.html +266 -0
  119. package/coverage/lcov-report/test/test-compile-model-with-pseudoclosure.ts.html +802 -0
  120. package/coverage/lcov-report/test/test-compile-model.ts.html +187 -0
  121. package/coverage/lcov-report/test/test-compile-trie.ts.html +541 -0
  122. package/coverage/lcov-report/test/test-default-apply-case.ts.html +466 -0
  123. package/coverage/lcov-report/test/test-default-search-term-to-key.ts.html +628 -0
  124. package/coverage/lcov-report/test/test-error-logger.ts.html +196 -0
  125. package/coverage/lcov-report/test/test-join-word-breaker.ts.html +376 -0
  126. package/coverage/lcov-report/test/test-model-definitions.ts.html +676 -0
  127. package/coverage/lcov-report/test/test-override-script-defaults.ts.html +184 -0
  128. package/coverage/lcov-report/test/test-parse-wordlist.ts.html +466 -0
  129. package/coverage/lcov-report/test/test-punctuation.ts.html +190 -0
  130. package/coverage/lcov-report/test/wordbreakers/data.ts.html +5413 -0
  131. package/coverage/lcov-report/test/wordbreakers/default-wordbreaker-esm.ts.html +1234 -0
  132. package/coverage/lcov-report/test/wordbreakers/index.html +131 -0
  133. package/coverage/lcov.info +5969 -0
  134. package/package.json +61 -0
  135. package/src/build-trie.ts +511 -0
  136. package/src/join-word-breaker-decorator.ts +134 -0
  137. package/src/lexical-model-compiler.ts +179 -0
  138. package/src/lexical-model.ts +150 -0
  139. package/src/main.ts +62 -0
  140. package/src/model-compiler-errors.ts +203 -0
  141. package/src/model-defaults.ts +111 -0
  142. package/src/model-definitions.ts +222 -0
  143. package/src/script-overrides-decorator.ts +75 -0
  144. package/test/README.md +15 -0
  145. package/test/fixtures/example.qaa.joinwordbreaker/example.qaa.joinwordbreaker.model.ts +10 -0
  146. package/test/fixtures/example.qaa.joinwordbreaker/wordlist.tsv +3 -0
  147. package/test/fixtures/example.qaa.scriptusesspaces/example.qaa.scriptusesspaces.model.ts +10 -0
  148. package/test/fixtures/example.qaa.scriptusesspaces/wordlist.tsv +8 -0
  149. package/test/fixtures/example.qaa.sencoten/example.qaa.sencoten.model.kmp.json +45 -0
  150. package/test/fixtures/example.qaa.sencoten/example.qaa.sencoten.model.kps +35 -0
  151. package/test/fixtures/example.qaa.sencoten/example.qaa.sencoten.model.ts +6 -0
  152. package/test/fixtures/example.qaa.sencoten/wordlist.tsv +10 -0
  153. package/test/fixtures/example.qaa.smp/example.qaa.smp.model.ts +6 -0
  154. package/test/fixtures/example.qaa.smp/wordlist.tsv +5 -0
  155. package/test/fixtures/example.qaa.trivial/example.qaa.trivial.model.ts +5 -0
  156. package/test/fixtures/example.qaa.trivial/wordlist.tsv +3 -0
  157. package/test/fixtures/example.qaa.utf16be/example.qaa.utf16be.model.ts +5 -0
  158. package/test/fixtures/example.qaa.utf16be/wordlist.txt +0 -0
  159. package/test/fixtures/example.qaa.utf16le/example.qaa.utf16le.model.ts +5 -0
  160. package/test/fixtures/example.qaa.utf16le/wordlist.txt +0 -0
  161. package/test/fixtures/example.qaa.wordbreaker/example.qaa.wordbreaker.model.ts +9 -0
  162. package/test/fixtures/example.qaa.wordbreaker/wordlist.tsv +3 -0
  163. package/test/helpers/index.ts +187 -0
  164. package/test/test-compile-model-with-pseudoclosure.ts +239 -0
  165. package/test/test-compile-model.ts +34 -0
  166. package/test/test-compile-trie.ts +152 -0
  167. package/test/test-default-apply-case.ts +128 -0
  168. package/test/test-default-search-term-to-key.ts +181 -0
  169. package/test/test-error-logger.ts +38 -0
  170. package/test/test-join-word-breaker.ts +97 -0
  171. package/test/test-model-definitions.ts +198 -0
  172. package/test/test-override-script-defaults.ts +33 -0
  173. package/test/test-parse-wordlist.ts +127 -0
  174. package/test/test-punctuation.ts +35 -0
  175. package/test/tsconfig.json +22 -0
  176. package/test/wordbreakers/README.md +3 -0
  177. package/test/wordbreakers/data.ts +1776 -0
  178. package/test/wordbreakers/default-wordbreaker-esm.ts +383 -0
  179. package/tools/create-override-script-regexp.ts +145 -0
  180. package/tsconfig.json +17 -0
@@ -0,0 +1,111 @@
1
/**
 * Converts a wordform into an indexable search key WITHOUT changing its
 * letter case. It does this by normalizing to NFKD form, removing common
 * combining diacritical marks (U+0300..U+036F), and replacing directional
 * (curly) quotation marks with their plain ASCII equivalents.
 *
 * This is a very speculative implementation, that might work with
 * your language. We don't guarantee that this will be perfect for your
 * language, but it's a start.
 *
 * NFKD is an easy way to separate a Latin character from its diacritics.
 * Even then, orthographies regularly use code points that, under NFKD
 * normalization, do NOT decompose appropriately for your language
 * (e.g., SENĆOŦEN, Plains Cree in syllabics).
 *
 * Use this in early iterations of the model. For a production lexical model,
 * you will probably write/generate your own key function, tailored to your
 * language.
 *
 * NOTE: this function is written into compiled models via `.toString()`, so
 * its body must stay self-contained (no references to module-level helpers).
 *
 * @param wordform The word, as it appears in the wordlist or as typed.
 * @returns The search key for `wordform`.
 */
export function defaultSearchTermToKey(wordform: string): string {
  return wordform
    .normalize('NFKD')
    // Remove any combining diacritics (if input is in NFKD)
    .replace(/[\u0300-\u036F]/g, '')
    // Replace EVERY directional single quotation mark (U+2018, U+2019) with a
    // plain apostrophe. The `g` flag matters: without it only the first
    // occurrence in the wordform would be replaced.
    .replace(/[\u2018\u2019]/g, "'")
    // Also every directional double-quote mark (U+201C, U+201D).
    .replace(/[\u201C\u201D]/g, '"');
}
34
+
35
/**
 * Converts a wordform into an indexable search key. It does this by
 * normalizing to NFKD form, removing common combining diacritical marks
 * (U+0300..U+036F), lowercasing each code point INDIVIDUALLY through the
 * supplied casing function (to disregard context-sensitive case
 * transformations), and replacing directional (curly) quotation marks with
 * their plain ASCII equivalents.
 *
 * This is a very speculative implementation, that might work with
 * your language. We don't guarantee that this will be perfect for your
 * language, but it's a start.
 *
 * NFKD is an easy way to separate a Latin character from its diacritics.
 * Even then, orthographies regularly use code points that, under NFKD
 * normalization, do NOT decompose appropriately for your language
 * (e.g., SENĆOŦEN, Plains Cree in syllabics).
 *
 * Use this in early iterations of the model. For a production lexical model,
 * you will probably write/generate your own key function, tailored to your
 * language.
 *
 * @param wordform    The word, as it appears in the wordlist or as typed.
 * @param applyCasing Casing function; called as `applyCasing('lower', c)`
 *                    once per code point of the normalized wordform.
 * @returns The search key for `wordform`.
 */
export function defaultCasedSearchTermToKey(wordform: string, applyCasing: CasingFunction): string {
  // While this is a bit WET, as the basic `defaultSearchTermToKey` exists and performs some of
  // the same functions, repetition is the easiest way to allow the function to be safely compiled
  // with ease by use of `.toString()`.
  return Array.from(wordform
      .normalize('NFKD')
      // Remove any combining diacritics (if input is in NFKD)
      .replace(/[\u0300-\u036F]/g, '')
    ) // end of `Array.from`; yields one entry per code point, not per UTF-16 code unit
    .map(function(c) { return applyCasing('lower', c); })
    .join('')
    // Replace EVERY directional single quotation mark (U+2018, U+2019) with a
    // plain apostrophe. The `g` flag matters: without it only the first
    // occurrence in the wordform would be replaced.
    .replace(/[\u2018\u2019]/g, "'")
    // Also every directional double-quote mark (U+201C, U+201D).
    .replace(/[\u201C\u201D]/g, '"');
}
74
+
75
/**
 * Default casing behavior for lexical models when `languageUsesCasing` is
 * set to true.
 *
 * - 'lower':   the whole text is lowercased.
 * - 'upper':   the whole text is uppercased.
 * - 'initial': only the first code point is uppercased and the remainder is
 *              left exactly as given. Designed to cover cases like
 *              sentence-initial & proper noun capitalization in English.
 *
 * This may be overwritten as appropriate in model-specific implementations.
 *
 * @param casing One of 'lower', 'upper', or 'initial'.
 * @param text   The text to be modified.
 * @returns The text with the requested casing applied.
 */
export function defaultApplyCasing(casing: CasingForm, text: string): string {
  switch(casing) {
    case 'initial': {
      // How many UTF-16 code units the first code point occupies: 1, or 2
      // when the text opens with a complete surrogate pair.
      var headLength = 1;
      var firstUnit = text.charCodeAt(0);
      var opensWithHighSurrogate = firstUnit >= 0xD800 && firstUnit <= 0xDBFF;

      // A high surrogate only forms a pair if a low surrogate follows it.
      if(opensWithHighSurrogate && text.length > 1) {
        var secondUnit = text.charCodeAt(1);
        if(secondUnit >= 0xDC00 && secondUnit <= 0xDFFF) {
          headLength = 2;
        }
      }

      var head = text.substring(0, headLength);
      var tail = text.substring(headLength);
      // Uppercase the head; the tail is appended unchanged.
      return head.toUpperCase().concat(tail);
    }
    case 'upper':
      return text.toUpperCase();
    case 'lower':
      return text.toLowerCase();
  }
}
@@ -0,0 +1,222 @@
1
+ import { defaultApplyCasing,
2
+ defaultCasedSearchTermToKey,
3
+ defaultSearchTermToKey
4
+ } from "./model-defaults.js";
5
+
6
+ import KEYMAN_VERSION from "@keymanapp/keyman-version";
7
+ import { LexicalModelSource, WordformToKeySpec } from "./lexical-model.js";
8
+
9
+ /**
10
+ * Processes certain defined model behaviors in such a way that the needed closures
11
+ * may be safely compiled to a JS file and loaded within the LMLayer.
12
+ *
13
+ * This is accomplished by writing out a 'pseudoclosure' within the model's IIFE,
14
+ * then used to build _actual_ closures at LMLayer load time. This 'pseudoclosure'
15
+ * will very closely match the organizational patterns of this class in order to
16
+ * facilitate the maintenance of this approach.
17
+ */
18
+ export class ModelDefinitions {
19
+ static readonly COMPILED_NAME = 'definitions'; // Identifier of the variable that holds the compiled pseudoclosure in the emitted JS.
20
+ /**
21
+ * A closure fully implementing the model's defined `applyCasing` behavior with
22
+ * the function parameter preset to the version-appropriate default.
23
+ * `defaults.applyCasing` is captured as part of the closure.
24
+ *
25
+ * During compilation of some models (such as Trie-based wordlist templated models),
26
+ * this closure will be directly used as part of searchTermToKey.
27
+ *
28
+ * In compiled code, this will instead be defined in-line as an autogenerated closure
29
+ * using the other properties of the pseudoclosure.
30
+ */
31
+ applyCasing?: CasingFunction;
32
+
33
+ /**
34
+ * A closure fully implementing the model's defined `searchTermToKey` behavior
35
+ * based upon the model's specified casing rules. The `applyCasing` closure is
36
+ * itself captured within this closure.
37
+ *
38
+ * During compilation of some models (such as Trie-based wordlist templated models),
39
+ * this closure will be directly utilized when compiling the lexicon.
40
+ *
41
+ * In compiled code, this will instead be defined in-line as an autogenerated closure
42
+ * using the other properties of the pseudoclosure.
43
+ */
44
+ searchTermToKey?: WordformToKeySpec;
45
+
46
+ /**
47
+ * Contains embedded 'default' implementations that may be needed for
48
+ * closures in the compiled version, annotated with the current version
49
+ * of Developer.
50
+ */
51
+ private defaults: {
52
+ version: string;
53
+ applyCasing?: CasingFunction;
54
+ } = {
55
+ version: KEYMAN_VERSION.VERSION_WITH_TAG
56
+ };
57
+
58
+ /**
59
+ * Contains the model-specific definitions specified in the model's source.
60
+ *
61
+ * These definitions may expect `defaults.applyCasing` as a parameter in
62
+ * their final closures.
63
+ */
64
+ private model: {
65
+ applyCasing?: CasingFunction;
66
+ searchTermToKey?: WordformToKeySpec;
67
+ } = {};
68
+
69
+ constructor(modelSource: LexicalModelSource) { // Resolves casing and keying behavior from the model source into `defaults`, `model`, and the two public closures.
70
+ // Determine the model's `applyCasing` function / implementation.
71
+ if(modelSource.languageUsesCasing) {
72
+ this.defaults.applyCasing = defaultApplyCasing;
73
+
74
+ if(modelSource.applyCasing) {
75
+ this.model.applyCasing = modelSource.applyCasing;
76
+ let _this = this;
77
+
78
+ // Since the defined casing function may expect to take our default implementation
79
+ // as a parameter, we can define the full implementation via closure capture.
80
+ this.applyCasing = function(casing: CasingForm, text: string) {
81
+ return _this.model.applyCasing(casing, text, _this.defaults.applyCasing);
82
+ };
83
+ } else {
84
+ this.applyCasing = this.defaults.applyCasing;
85
+ }
86
+ }
87
+
88
+ // START: if(model type uses keying)...
89
+
90
+ // Use the model's own search-term-to-key function when one is specified; otherwise pick a default.
91
+ if(modelSource.searchTermToKey) {
92
+ this.model.searchTermToKey = modelSource.searchTermToKey;
93
+ } else if(modelSource.languageUsesCasing) {
94
+ // `this.applyCasing` was assigned in the casing section above.
95
+ // It is handed to the keying function as an argument (see the closure below) rather
96
+ // than captured by it, because .toString() on a closure does not result in proper compilation.
97
+ this.model.searchTermToKey = defaultCasedSearchTermToKey;
98
+ } else if(modelSource.languageUsesCasing == false) { // loose `==`: an undefined flag does not match here and falls through to the final branch
99
+ this.model.searchTermToKey = defaultSearchTermToKey;
100
+ } else {
101
+ // If languageUsesCasing is not defined, then we use pre-14.0 behavior,
102
+ // which expects a lowercased default.
103
+ this.model.searchTermToKey = defaultCasedSearchTermToKey;
104
+ // Needed to provide pre-14.0 default lowercasing as part of the
105
+ // search-term keying operation.
106
+ this.defaults.applyCasing = defaultApplyCasing;
107
+ // For compile-time use.
108
+ this.applyCasing = this.defaults.applyCasing;
109
+ }
110
+
111
+ let _this = this;
112
+ this.searchTermToKey = function(text: string) {
113
+ return _this.model.searchTermToKey(text, _this.applyCasing);
114
+ }
115
+
116
+ // END: if(model type uses keying)...
117
+ }
118
+
119
+ // ------------ end: common compile-time / run-time code ---------------
120
+
121
+ // START: handwritten compilation code (to accomplish the 'common' pattern defined above)
122
+
123
+ /**
124
+ * Writes out a compiled JS version of the pseudoclosure, preserving all function
125
+ * implementations.
126
+ *
127
+ * This should be written to the file within the same IIFE as the model but BEFORE
128
+ * the model itself, as the model will need to refer to the definitions herein.
129
+ */
130
+ compileDefinitions(): string {
131
+ let defn: string = '';
132
+ defn += `var ${PSEUDOCLOSURE} = {\n`
133
+
134
+ // ----------------------
135
+ // START - the 'defaults', which are common within the same Developer version.
136
+ defn += ` defaults: {\n version: "${this.defaults.version}"`;
137
+
138
+ // Only write out `applyCasing` if and when it is needed.
139
+ if(this.defaults.applyCasing) {
140
+ defn += `,\n applyCasing: ${this.defaults.applyCasing.toString()}`;
141
+ }
142
+
143
+ // Finalizes `defaults`
144
+ defn += `\n },`;
145
+ // END - the 'defaults'
146
+
147
+ // ----------------------
148
+ // START - model-specific definitions (when defined)
149
+ defn += ` model: {\n`;
150
+ defn += ` searchTermToKey: ${this.model.searchTermToKey.toString()}`;
151
+
152
+ if(this.model.applyCasing) {
153
+ defn += `,\n applyCasing: ${this.model.applyCasing.toString()}`;
154
+ }
155
+ defn += `\n }`
156
+ // END - model-specific definitions
157
+
158
+ // ----------------------
159
+ // START - compiled closures. Given those definitions, write out the
160
+ // pseudoclosure-referencing closures for the needed methods.
161
+
162
+ // We should be able to define these closures in-line with the object's
163
+ // initialization. Worst-case, we simply move the definitions outside
164
+ // of the pseudoclosure's init and THEN define/assign these closures to
165
+ // the object, as references will be available then for sure.
166
+ if(this.model.applyCasing) {
167
+ // A major potential issue: if the user wants to call extra custom functions that they've written.
168
+ //
169
+ // `applyCasing` recursion SHOULD be fine if they write `this.applyCasing()` and forward all arguments
170
+ // appropriately, as it will be known as `applyCasing` on the runtime `this` (`model`) object.
171
+ //
172
+ // Similarly, as long as any helper functions are similarly compiled and stored as part of `model`,
173
+ // they should be accessible too. The issue would be to actually allow use of extra custom funcs
174
+ // and include them as part of this object as part of compilation.
175
+ defn += `,\n applyCasing: function(caseToApply, text) {
176
+ return ${PSEUDOCLOSURE}.model.applyCasing(caseToApply, text, ${PSEUDOCLOSURE}.defaults.applyCasing);
177
+ }`;
178
+ } else if(this.defaults.applyCasing) {
179
+ // We can't directly assign from `.defaults`: the emitted object literal cannot read its own
180
+ // fields while it is still being initialized. A wrapper function that looks up `.defaults` when called is valid.
181
+ defn += `,\n applyCasing: function(caseToApply, text) {
182
+ return ${PSEUDOCLOSURE}.defaults.applyCasing(caseToApply, text);
183
+ }`;
184
+ }
185
+
186
+ // if(this.searchTermToKey) {
187
+ defn += `,\n searchTermToKey: function(text) {
188
+ return ${PSEUDOCLOSURE}.model.searchTermToKey(text, ${PSEUDOCLOSURE}.applyCasing);
189
+ }`;
190
+ // }
191
+
192
+ // END - compiled closures.
193
+
194
+ // ----------------------
195
+ // Finalize the definition of... `definitions`.
196
+ defn += `\n};\n`;
197
+
198
+ return defn;
199
+ }
200
+
201
+ /**
202
+ * Compiles the model-options entry for `searchTermToKey` in reference to the
203
+ * compiled pseudoclosure.
204
+ */
205
+ compileSearchTermToKey(): string {
206
+ // Simply point the model to the constructed closure defined by `compileDefinitions`.
207
+ // See "START - compiled closures" section.
208
+ return `${PSEUDOCLOSURE}.searchTermToKey`;
209
+ }
210
+
211
+ /**
212
+ * Compiles the model-options entry for `applyCasing` in reference to the
213
+ * compiled pseudoclosure.
214
+ */
215
+ compileApplyCasing(): string {// Simply point the model to the constructed closure defined by `compileDefinitions`.
216
+ // See "START - compiled closures" section.
217
+ return `${PSEUDOCLOSURE}.applyCasing`;
218
+ }
219
+ }
220
+
221
+ // Declared after the class because it reads the static `ModelDefinitions.COMPILED_NAME`; the methods above only use it when they are called.
222
+ const PSEUDOCLOSURE = ModelDefinitions.COMPILED_NAME;
@@ -0,0 +1,75 @@
1
+ import { OverrideScriptDefaults } from "./lexical-model.js";
2
+
3
/**
 * Wraps a word breaker so that adjacent spans it produces are re-joined when
 * both contain a letter or mark matched by `HAS_SOUTHEAST_ASIAN_LETTER` and
 * nothing separates them. The effect is that such text is only broken where
 * the spans are not back-to-back.
 *
 * @param breaker The word breaker whose output should be post-processed.
 * @param option  The override to apply; only 'break-words-at-spaces' is supported.
 * @returns A word-breaking function with the override applied.
 * @throws Error if `option` is any other value.
 */
export function decorateWithScriptOverrides(breaker: WordBreakingFunction, option: OverrideScriptDefaults) {
  if (option !== 'break-words-at-spaces') {
    throw new Error(`Unsupported script override: ${option}`);
  }

  /**
   * Matches when a span contains a Southeast-Asian letter or mark anywhere.
   * This makes it a candidate for joining.
   *
   * See: tools/create-override-script-regexp.ts for how this RegExp was
   * generated.
   *
   * Last updated for Unicode 13.0.0.
   *
   * NOTE(review): the trailing ranges \u30A1-\u30FA and \u30FC-\u30FF lie in
   * the Katakana block; presumably intentional output of the generator tool,
   * but worth confirming against it.
   */
  const HAS_SOUTHEAST_ASIAN_LETTER = /[\u0E01-\u0E3A\u0E40-\u0E4E\u0E81\u0E82\u0E84\u0E86-\u0E8A\u0E8C-\u0EA3\u0EA5\u0EA7-\u0EBD\u0EC0-\u0EC4\u0EC6\u0EC8-\u0ECD\u0EDC-\u0EDF\u1000-\u103F\u1050-\u108F\u109A-\u109D\u1780-\u17D3\u17D7\u17DC\u17DD\u30A1-\u30FA\u30FC-\u30FF]/;

  return function enhancedBreaker(phrase: string): Span[] {
    const originalSpans = breaker(phrase);
    // Build a fresh array instead of shifting elements off `originalSpans`:
    // that array belongs to the wrapped breaker and must not be modified.
    const outputSpans: Span[] = [];

    for (const currentSpan of originalSpans) {
      const previousSpan = outputSpans[outputSpans.length - 1];

      if (previousSpan !== undefined &&
          spansAreBackToBack(previousSpan, currentSpan) &&
          hasSouthEastAsianLetter(previousSpan) &&
          hasSouthEastAsianLetter(currentSpan)
      ) {
        // previous span SHOULD be joined with current!
        outputSpans[outputSpans.length - 1] = concatenateSpans(previousSpan, currentSpan);
      } else {
        // First span, or a span that must stay separate.
        outputSpans.push(currentSpan);
      }
    }

    return outputSpans;
  };

  /** True when the span's text contains at least one matching character. */
  function hasSouthEastAsianLetter(span: Span) {
    return HAS_SOUTHEAST_ASIAN_LETTER.test(span.text);
  }

  /**
   * Returns true when the spans are contiguous.
   * Order matters when calling this function!
   */
  function spansAreBackToBack(former: Span, latter: Span): boolean {
    return former.end === latter.start;
  }

  /**
   * Merges two contiguous spans into a single new span covering both.
   * Throws if `latter` does not start exactly where `former` ends.
   */
  function concatenateSpans(former: Span, latter: Span) {
    if (latter.start !== former.end) {
      throw new Error(`Cannot concatenate non-contiguous spans: ${JSON.stringify(former)}/${JSON.stringify(latter)}`);
    }

    return {
      start: former.start,
      end: latter.end,
      length: former.length + latter.length,
      text: former.text + latter.text
    };
  }
}
package/test/README.md ADDED
@@ -0,0 +1,15 @@
1
+ Lexical Model and Package Compiler
2
+ ==================================
3
+
4
+ Build
5
+ -----
6
+
7
+ npm run build
8
+
9
+
10
+ Test
11
+ ----
12
+
13
+ npm test
14
+
15
+ **NOTE**: ensure you run `npm run build` before running the tests!
@@ -0,0 +1,10 @@
1
+ const source: LexicalModelSource = {
2
+ format: 'trie-1.0',
3
+ sources: ['wordlist.tsv'],
4
+ /* Keyman 14.0+ word breaker specification: */
5
+ wordBreaker: {
6
+ use: 'default',
7
+ joinWordsAt: [':', '-', '·']
8
+ }
9
+ };
10
+ export default source;
@@ -0,0 +1,3 @@
1
+ Kanien'kehá:ka 1000
2
+ amiskwaciy-wâskahikan 100
3
+ cel·la 10
@@ -0,0 +1,10 @@
1
+ const source: LexicalModelSource = {
2
+ format: 'trie-1.0',
3
+ sources: ['wordlist.tsv'],
4
+ /* Keyman 14.0+ word breaker specification: */
5
+ wordBreaker: {
6
+ use: 'default',
7
+ overrideScriptDefaults: 'break-words-at-spaces',
8
+ }
9
+ };
10
+ export default source;
@@ -0,0 +1,8 @@
1
+ ເກ໌າະ 675
2
+ ເຢາະ 285
3
+ ຢັອຫ 246
4
+ ສນາ 99
5
+ ບຣອມ 69
6
+ ລະ 40
7
+ ຈາກ 26
8
+ ກ໌ນິ 26
@@ -0,0 +1,45 @@
1
+ {
2
+ "system": {
3
+ "keymanDeveloperVersion": "12.0.1500.0",
4
+ "fileVersion": "12.0"
5
+ },
6
+ "options": {},
7
+ "info": {
8
+ "author": {
9
+ "description": "Eddie Antonio Santos",
10
+ "url": "mailto:Eddie.Santos@nrc-cnrc.gc.ca"
11
+ },
12
+ "copyright": {
13
+ "description": "© 2019 National Research Council Canada"
14
+ },
15
+ "name": {
16
+ "description": "SENĆOŦEN (Saanich Dialect) Lexical Model"
17
+ },
18
+ "version": {
19
+ "description": "1.0.3"
20
+ }
21
+ },
22
+ "files": [
23
+ {
24
+ "name": "example.qaa.sencoten.model.js",
25
+ "description": "Lexical model example.qaa.sencoten.model.js",
26
+ "copyLocation": 0
27
+ }
28
+ ],
29
+ "lexicalModels": [
30
+ {
31
+ "name": "SENĆOŦEN dictionary",
32
+ "id": "example.qaa.sencoten",
33
+ "languages": [
34
+ {
35
+ "name": "North Straits Salish",
36
+ "id": "qaa"
37
+ },
38
+ {
39
+ "name": "SENĆOŦEN",
40
+ "id": "qaa-Latn"
41
+ }
42
+ ]
43
+ }
44
+ ]
45
+ }
@@ -0,0 +1,35 @@
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <Package>
3
+ <System>
4
+ <KeymanDeveloperVersion>12.0.1500.0</KeymanDeveloperVersion>
5
+ <FileVersion>12.0</FileVersion>
6
+ </System>
7
+ <Options>
8
+ <FollowKeyboardVersion/>
9
+ </Options>
10
+ <Info>
11
+ <Name URL="">SENĆOŦEN (Saanich Dialect) Lexical Model</Name>
12
+ <Copyright URL="">© 2019 National Research Council Canada</Copyright>
13
+ <Author URL="mailto:Eddie.Santos@nrc-cnrc.gc.ca">Eddie Antonio Santos</Author>
14
+ <Version>1.0.3</Version>
15
+ </Info>
16
+ <Files>
17
+ <File>
18
+ <Name>..\build\example.qaa.sencoten.model.js</Name>
19
+ <Description>Lexical model example.qaa.sencoten.model.js</Description>
20
+ <CopyLocation>0</CopyLocation>
21
+ <FileType>.model.js</FileType>
22
+ </File>
23
+ </Files>
24
+ <LexicalModels>
25
+ <LexicalModel>
26
+ <Name>SENĆOŦEN dictionary</Name>
27
+ <ID>example.qaa.sencoten</ID>
28
+ <Version>1.0.3</Version>
29
+ <Languages>
30
+ <Language ID="qaa">North Straits Salish</Language>
31
+ <Language ID="qaa-Latn">SENĆOŦEN</Language>
32
+ </Languages>
33
+ </LexicalModel>
34
+ </LexicalModels>
35
+ </Package>
@@ -0,0 +1,6 @@
1
+ const source: LexicalModelSource = {
2
+ format: 'trie-1.0',
3
+ sources: ['wordlist.tsv'],
4
+ wordBreaker: 'default',
5
+ };
6
+ export default source;
@@ -0,0 +1,10 @@
1
+ TŦE 13644
2
+ E 9134
3
+ SEN 4816
4
+ Ȼ 3479
5
+ SW̱ 2621
6
+ NIȽ 2314
7
+ U¸ 2298
8
+ I¸ 1988
9
+ ȻSE 1925
10
+ I 1884
@@ -0,0 +1,6 @@
1
+ const source: LexicalModelSource = {
2
+ format: 'trie-1.0',
3
+ sources: ['wordlist.tsv'],
4
+ wordBreaker: 'default',
5
+ };
6
+ export default source;
@@ -0,0 +1,5 @@
1
+ CRAZ🤪 13644
2
+ 🙄 9134
3
+ 😇 4816
4
+ 🇸
5
+ 🇺
@@ -0,0 +1,5 @@
1
+ const source: LexicalModelSource = {
2
+ format: 'trie-1.0',
3
+ sources: ['wordlist.tsv'],
4
+ };
5
+ export default source;
@@ -0,0 +1,3 @@
1
+ I
2
+ like
3
+ turtles
@@ -0,0 +1,5 @@
1
+ const source: LexicalModelSource = {
2
+ format: 'trie-1.0',
3
+ sources: ['wordlist.txt'],
4
+ };
5
+ export default source;
@@ -0,0 +1,5 @@
1
+ const source: LexicalModelSource = {
2
+ format: 'trie-1.0',
3
+ sources: ['wordlist.txt'],
4
+ };
5
+ export default source;
@@ -0,0 +1,9 @@
1
+ const source: LexicalModelSource = {
2
+ format: 'trie-1.0',
3
+ sources: ['wordlist.tsv'],
4
+ /* Keyman 14.0+ word breaker specification: */
5
+ wordBreaker: {
6
+ use: 'default'
7
+ }
8
+ };
9
+ export default source;
@@ -0,0 +1,3 @@
1
+ I
2
+ like
3
+ turtles