@keymanapp/kmc-model 17.0.85-alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.nyc_output/coverage-10524-1681239236645-0.json +1 -0
- package/Makefile +38 -0
- package/build/cjs-src/lexical-model-compiler.cjs +152688 -0
- package/build/src/build-trie.d.ts +40 -0
- package/build/src/build-trie.d.ts.map +1 -0
- package/build/src/build-trie.js +362 -0
- package/build/src/build-trie.js.map +1 -0
- package/build/src/join-word-breaker-decorator.d.ts +10 -0
- package/build/src/join-word-breaker-decorator.d.ts.map +1 -0
- package/build/src/join-word-breaker-decorator.js +121 -0
- package/build/src/join-word-breaker-decorator.js.map +1 -0
- package/build/src/lexical-model-compiler.d.ts +19 -0
- package/build/src/lexical-model-compiler.d.ts.map +1 -0
- package/build/src/lexical-model-compiler.js +155 -0
- package/build/src/lexical-model-compiler.js.map +1 -0
- package/build/src/lexical-model.d.ts +135 -0
- package/build/src/lexical-model.d.ts.map +1 -0
- package/build/src/lexical-model.js +6 -0
- package/build/src/lexical-model.js.map +1 -0
- package/build/src/main.d.ts +15 -0
- package/build/src/main.d.ts.map +1 -0
- package/build/src/main.js +46 -0
- package/build/src/main.js.map +1 -0
- package/build/src/model-compiler-errors.d.ts +77 -0
- package/build/src/model-compiler-errors.d.ts.map +1 -0
- package/build/src/model-compiler-errors.js +156 -0
- package/build/src/model-compiler-errors.js.map +1 -0
- package/build/src/model-defaults.d.ts +56 -0
- package/build/src/model-defaults.d.ts.map +1 -0
- package/build/src/model-defaults.js +106 -0
- package/build/src/model-defaults.js.map +1 -0
- package/build/src/model-definitions.d.ts +71 -0
- package/build/src/model-definitions.d.ts.map +1 -0
- package/build/src/model-definitions.js +189 -0
- package/build/src/model-definitions.js.map +1 -0
- package/build/src/script-overrides-decorator.d.ts +4 -0
- package/build/src/script-overrides-decorator.d.ts.map +1 -0
- package/build/src/script-overrides-decorator.js +63 -0
- package/build/src/script-overrides-decorator.js.map +1 -0
- package/build/test/helpers/index.d.ts +69 -0
- package/build/test/helpers/index.d.ts.map +1 -0
- package/build/test/helpers/index.js +160 -0
- package/build/test/helpers/index.js.map +1 -0
- package/build/test/test-compile-model-with-pseudoclosure.d.ts +2 -0
- package/build/test/test-compile-model-with-pseudoclosure.d.ts.map +1 -0
- package/build/test/test-compile-model-with-pseudoclosure.js +200 -0
- package/build/test/test-compile-model-with-pseudoclosure.js.map +1 -0
- package/build/test/test-compile-model.d.ts +2 -0
- package/build/test/test-compile-model.d.ts.map +1 -0
- package/build/test/test-compile-model.js +30 -0
- package/build/test/test-compile-model.js.map +1 -0
- package/build/test/test-compile-trie.d.ts +2 -0
- package/build/test/test-compile-trie.d.ts.map +1 -0
- package/build/test/test-compile-trie.js +125 -0
- package/build/test/test-compile-trie.js.map +1 -0
- package/build/test/test-default-apply-case.d.ts +2 -0
- package/build/test/test-default-apply-case.d.ts.map +1 -0
- package/build/test/test-default-apply-case.js +105 -0
- package/build/test/test-default-apply-case.js.map +1 -0
- package/build/test/test-default-search-term-to-key.d.ts +2 -0
- package/build/test/test-default-search-term-to-key.d.ts.map +1 -0
- package/build/test/test-default-search-term-to-key.js +148 -0
- package/build/test/test-default-search-term-to-key.js.map +1 -0
- package/build/test/test-error-logger.d.ts +2 -0
- package/build/test/test-error-logger.d.ts.map +1 -0
- package/build/test/test-error-logger.js +26 -0
- package/build/test/test-error-logger.js.map +1 -0
- package/build/test/test-join-word-breaker.d.ts +2 -0
- package/build/test/test-join-word-breaker.d.ts.map +1 -0
- package/build/test/test-join-word-breaker.js +84 -0
- package/build/test/test-join-word-breaker.js.map +1 -0
- package/build/test/test-model-definitions.d.ts +2 -0
- package/build/test/test-model-definitions.d.ts.map +1 -0
- package/build/test/test-model-definitions.js +165 -0
- package/build/test/test-model-definitions.js.map +1 -0
- package/build/test/test-override-script-defaults.d.ts +2 -0
- package/build/test/test-override-script-defaults.d.ts.map +1 -0
- package/build/test/test-override-script-defaults.js +28 -0
- package/build/test/test-override-script-defaults.js.map +1 -0
- package/build/test/test-parse-wordlist.d.ts +2 -0
- package/build/test/test-parse-wordlist.d.ts.map +1 -0
- package/build/test/test-parse-wordlist.js +110 -0
- package/build/test/test-parse-wordlist.js.map +1 -0
- package/build/test/test-punctuation.d.ts +2 -0
- package/build/test/test-punctuation.d.ts.map +1 -0
- package/build/test/test-punctuation.js +31 -0
- package/build/test/test-punctuation.js.map +1 -0
- package/build/test/tsconfig.tsbuildinfo +1 -0
- package/build/test/wordbreakers/data.d.ts +35 -0
- package/build/test/wordbreakers/data.d.ts.map +1 -0
- package/build/test/wordbreakers/data.js +1778 -0
- package/build/test/wordbreakers/data.js.map +1 -0
- package/build/test/wordbreakers/default-wordbreaker-esm.d.ts +10 -0
- package/build/test/wordbreakers/default-wordbreaker-esm.d.ts.map +1 -0
- package/build/test/wordbreakers/default-wordbreaker-esm.js +354 -0
- package/build/test/wordbreakers/default-wordbreaker-esm.js.map +1 -0
- package/build/tsconfig.tsbuildinfo +1 -0
- package/build.sh +73 -0
- package/coverage/lcov-report/base.css +224 -0
- package/coverage/lcov-report/block-navigation.js +87 -0
- package/coverage/lcov-report/favicon.png +0 -0
- package/coverage/lcov-report/index.html +161 -0
- package/coverage/lcov-report/prettify.css +1 -0
- package/coverage/lcov-report/prettify.js +2 -0
- package/coverage/lcov-report/sort-arrow-sprite.png +0 -0
- package/coverage/lcov-report/sorter.js +196 -0
- package/coverage/lcov-report/src/build-trie.ts.html +1618 -0
- package/coverage/lcov-report/src/index.html +221 -0
- package/coverage/lcov-report/src/join-word-breaker-decorator.ts.html +487 -0
- package/coverage/lcov-report/src/lexical-model-compiler.ts.html +622 -0
- package/coverage/lcov-report/src/main.ts.html +271 -0
- package/coverage/lcov-report/src/model-compiler-errors.ts.html +691 -0
- package/coverage/lcov-report/src/model-defaults.ts.html +415 -0
- package/coverage/lcov-report/src/model-definitions.ts.html +748 -0
- package/coverage/lcov-report/src/script-overrides-decorator.ts.html +310 -0
- package/coverage/lcov-report/test/helpers/index.html +116 -0
- package/coverage/lcov-report/test/helpers/index.ts.html +646 -0
- package/coverage/lcov-report/test/index.html +266 -0
- package/coverage/lcov-report/test/test-compile-model-with-pseudoclosure.ts.html +802 -0
- package/coverage/lcov-report/test/test-compile-model.ts.html +187 -0
- package/coverage/lcov-report/test/test-compile-trie.ts.html +541 -0
- package/coverage/lcov-report/test/test-default-apply-case.ts.html +466 -0
- package/coverage/lcov-report/test/test-default-search-term-to-key.ts.html +628 -0
- package/coverage/lcov-report/test/test-error-logger.ts.html +196 -0
- package/coverage/lcov-report/test/test-join-word-breaker.ts.html +376 -0
- package/coverage/lcov-report/test/test-model-definitions.ts.html +676 -0
- package/coverage/lcov-report/test/test-override-script-defaults.ts.html +184 -0
- package/coverage/lcov-report/test/test-parse-wordlist.ts.html +466 -0
- package/coverage/lcov-report/test/test-punctuation.ts.html +190 -0
- package/coverage/lcov-report/test/wordbreakers/data.ts.html +5413 -0
- package/coverage/lcov-report/test/wordbreakers/default-wordbreaker-esm.ts.html +1234 -0
- package/coverage/lcov-report/test/wordbreakers/index.html +131 -0
- package/coverage/lcov.info +5969 -0
- package/package.json +61 -0
- package/src/build-trie.ts +511 -0
- package/src/join-word-breaker-decorator.ts +134 -0
- package/src/lexical-model-compiler.ts +179 -0
- package/src/lexical-model.ts +150 -0
- package/src/main.ts +62 -0
- package/src/model-compiler-errors.ts +203 -0
- package/src/model-defaults.ts +111 -0
- package/src/model-definitions.ts +222 -0
- package/src/script-overrides-decorator.ts +75 -0
- package/test/README.md +15 -0
- package/test/fixtures/example.qaa.joinwordbreaker/example.qaa.joinwordbreaker.model.ts +10 -0
- package/test/fixtures/example.qaa.joinwordbreaker/wordlist.tsv +3 -0
- package/test/fixtures/example.qaa.scriptusesspaces/example.qaa.scriptusesspaces.model.ts +10 -0
- package/test/fixtures/example.qaa.scriptusesspaces/wordlist.tsv +8 -0
- package/test/fixtures/example.qaa.sencoten/example.qaa.sencoten.model.kmp.json +45 -0
- package/test/fixtures/example.qaa.sencoten/example.qaa.sencoten.model.kps +35 -0
- package/test/fixtures/example.qaa.sencoten/example.qaa.sencoten.model.ts +6 -0
- package/test/fixtures/example.qaa.sencoten/wordlist.tsv +10 -0
- package/test/fixtures/example.qaa.smp/example.qaa.smp.model.ts +6 -0
- package/test/fixtures/example.qaa.smp/wordlist.tsv +5 -0
- package/test/fixtures/example.qaa.trivial/example.qaa.trivial.model.ts +5 -0
- package/test/fixtures/example.qaa.trivial/wordlist.tsv +3 -0
- package/test/fixtures/example.qaa.utf16be/example.qaa.utf16be.model.ts +5 -0
- package/test/fixtures/example.qaa.utf16be/wordlist.txt +0 -0
- package/test/fixtures/example.qaa.utf16le/example.qaa.utf16le.model.ts +5 -0
- package/test/fixtures/example.qaa.utf16le/wordlist.txt +0 -0
- package/test/fixtures/example.qaa.wordbreaker/example.qaa.wordbreaker.model.ts +9 -0
- package/test/fixtures/example.qaa.wordbreaker/wordlist.tsv +3 -0
- package/test/helpers/index.ts +187 -0
- package/test/test-compile-model-with-pseudoclosure.ts +239 -0
- package/test/test-compile-model.ts +34 -0
- package/test/test-compile-trie.ts +152 -0
- package/test/test-default-apply-case.ts +128 -0
- package/test/test-default-search-term-to-key.ts +181 -0
- package/test/test-error-logger.ts +38 -0
- package/test/test-join-word-breaker.ts +97 -0
- package/test/test-model-definitions.ts +198 -0
- package/test/test-override-script-defaults.ts +33 -0
- package/test/test-parse-wordlist.ts +127 -0
- package/test/test-punctuation.ts +35 -0
- package/test/tsconfig.json +22 -0
- package/test/wordbreakers/README.md +3 -0
- package/test/wordbreakers/data.ts +1776 -0
- package/test/wordbreakers/default-wordbreaker-esm.ts +383 -0
- package/tools/create-override-script-regexp.ts +145 -0
- package/tsconfig.json +17 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
 * Converts a wordform into an indexable search key WITHOUT touching letter case.
 *
 * The key is produced by:
 *  1. normalizing to NFKD via String.prototype.normalize(), which separates a
 *     base character from its combining diacritics;
 *  2. removing combining marks in the range U+0300..U+036F;
 *  3. replacing EVERY directional (curly) single or double quotation mark with
 *     its plain ASCII counterpart.
 *
 * For the case-folding variant, see `defaultCasedSearchTermToKey`.
 *
 * This is a very speculative implementation that might work with your
 * language. We don't guarantee that this will be perfect for your language,
 * but it's a start. Orthographies regularly use code points that, under NFKD
 * normalization, do NOT decompose appropriately for the language
 * (e.g., SENĆOŦEN, Plains Cree in syllabics).
 *
 * Use this in early iterations of the model. For a production lexical model,
 * you will probably write/generate your own key function, tailored to your
 * language. There is a chance the default will work properly out of the box.
 *
 * @param wordform The word to convert.
 * @returns The search key for `wordform`.
 */
export function defaultSearchTermToKey(wordform: string): string {
  return wordform
    .normalize('NFKD')
    // Remove any combining diacritics (if input is in NFKD)
    .replace(/[\u0300-\u036F]/g, '')
    // Replace ALL directional single quotation marks (U+2018, U+2019) with
    // plain apostrophes. The `g` flag matters: without it only the first
    // occurrence in the wordform would be converted.
    .replace(/[\u2018\u2019]/g, "'")
    // Likewise for ALL directional double quotation marks (U+201C, U+201D).
    .replace(/[\u201C\u201D]/g, '"');
}
|
|
34
|
+
|
|
35
|
+
/**
 * Converts a wordform into an indexable, case-normalized search key.
 *
 * The key is produced by:
 *  1. normalizing to NFKD via String.prototype.normalize(), which separates a
 *     base character from its combining diacritics;
 *  2. removing combining marks in the range U+0300..U+036F;
 *  3. lowercasing each character INDIVIDUALLY through `applyCasing` (to
 *     disregard context-sensitive case transformations);
 *  4. replacing EVERY directional (curly) single or double quotation mark with
 *     its plain ASCII counterpart.
 *
 * This is a very speculative implementation that might work with your
 * language. We don't guarantee that this will be perfect for your language,
 * but it's a start. Orthographies regularly use code points that, under NFKD
 * normalization, do NOT decompose appropriately for the language
 * (e.g., SENĆOŦEN, Plains Cree in syllabics).
 *
 * Use this in early iterations of the model. For a production lexical model,
 * you will probably write/generate your own key function, tailored to your
 * language. There is a chance the default will work properly out of the box.
 *
 * @param wordform    The word to convert.
 * @param applyCasing Casing function; invoked as `applyCasing('lower', c)` once
 *                    per element yielded by `Array.from` on the stripped string.
 * @returns The search key for `wordform`.
 */
export function defaultCasedSearchTermToKey(wordform: string, applyCasing: CasingFunction): string {
  // While this is a bit WET, as the basic `defaultSearchTermToKey` exists and performs some of
  // the same functions, repetition is the easiest way to allow the function to be safely compiled
  // with ease by use of `.toString()`. For the same reason, this body must not reference any
  // helper defined outside of itself.
  return Array.from(wordform
      .normalize('NFKD')
      // Remove any combining diacritics (if input is in NFKD)
      .replace(/[\u0300-\u036F]/g, '')
    ) // end of `Array.from`
    .map(function(c) { return applyCasing('lower', c)})
    .join('')
    // Replace ALL directional single quotation marks (U+2018, U+2019) with
    // plain apostrophes. The `g` flag matters: without it only the first
    // occurrence in the wordform would be converted.
    .replace(/[\u2018\u2019]/g, "'")
    // Likewise for ALL directional double quotation marks (U+201C, U+201D).
    .replace(/[\u201C\u201D]/g, '"');
}
|
|
74
|
+
|
|
75
|
+
/**
 * Default casing behavior for lexical models that set `languageUsesCasing`
 * to true.
 *
 * @param casing One of 'lower' (lowercased), 'upper' (uppercased), or 'initial'.
 *
 *               'initial' uppercases only the first character of `text` and leaves
 *               every following character exactly as given. It is designed to cover
 *               cases like sentence-initial & proper noun capitalization in English,
 *               and may be overridden as appropriate in model-specific implementations.
 * @param text   The text to be modified.
 * @returns The re-cased text.
 */
export function defaultApplyCasing(casing: CasingForm, text: string): string {
  switch(casing) {
    case 'lower':
      return text.toLowerCase();
    case 'upper':
      return text.toUpperCase();
    case 'initial':
      // Number of UTF-16 code units occupied by the first character: 1 for a
      // BMP character, 2 when the string opens with a surrogate pair.
      var headLength = 1;
      var first = text.charCodeAt(0);
      var firstIsHighSurrogate = first >= 0xD800 && first <= 0xDBFF;

      // Only look for a low surrogate when the string is long enough to hold one.
      if(firstIsHighSurrogate && text.length > 1) {
        var second = text.charCodeAt(1);
        if(second >= 0xDC00 && second <= 0xDFFF) {
          // A complete surrogate pair; both units make up the 'first' character.
          headLength = 2;
        }
      }

      // Uppercase the head and append the tail untouched (the tail is NOT lowercased).
      var head = text.substring(0, headLength).toUpperCase();
      var tail = text.substring(headLength);
      return head.concat(tail);
  }
}
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
import { defaultApplyCasing,
|
|
2
|
+
defaultCasedSearchTermToKey,
|
|
3
|
+
defaultSearchTermToKey
|
|
4
|
+
} from "./model-defaults.js";
|
|
5
|
+
|
|
6
|
+
import KEYMAN_VERSION from "@keymanapp/keyman-version";
|
|
7
|
+
import { LexicalModelSource, WordformToKeySpec } from "./lexical-model.js";
|
|
8
|
+
|
|
9
|
+
/**
 * Processes certain defined model behaviors in such a way that the needed closures
 * may be safely compiled to a JS file and loaded within the LMLayer.
 *
 * This is accomplished by writing out a 'pseudoclosure' within the model's IIFE,
 * then used to build _actual_ closures at LMLayer load time.  This 'pseudoclosure'
 * will very closely match the organizational patterns of this class in order to
 * facilitate the maintenance of this approach.
 *
 * The constructor builds the compile-time closures (`applyCasing`, `searchTermToKey`);
 * `compileDefinitions()` writes the matching JS source text, and
 * `compileSearchTermToKey()` / `compileApplyCasing()` return expressions that
 * reference members of that emitted object.
 */
export class ModelDefinitions {
  /** Name of the variable that `compileDefinitions()` declares in the emitted JS. */
  static readonly COMPILED_NAME = 'definitions';
  /**
   * A closure fully implementing the model's defined `applyCasing` behavior with
   * the function parameter preset to the version-appropriate default.
   * `defaults.applyCasing` is captured as part of the closure.
   *
   * During compilation of some models (such as Trie-based wordlist templated models),
   * this closure will be directly used as part of searchTermToKey.
   *
   * In compiled code, this will instead be defined in-line as an autogenerated closure
   * using the other properties of the pseudoclosure.
   *
   * Left undefined by the constructor when `languageUsesCasing` is explicitly false,
   * and also when it is unset while the model supplies its own `searchTermToKey`.
   */
  applyCasing?: CasingFunction;

  /**
   * A closure fully implementing the model's defined `searchTermToKey` behavior
   * based upon the model's specified casing rules.  The `applyCasing` closure is
   * itself captured within this closure.
   *
   * During compilation of some models (such as Trie-based wordlist templated models),
   * this closure will be directly utilized when compiling the lexicon.
   *
   * In compiled code, this will instead be defined in-line as an autogenerated closure
   * using the other properties of the pseudoclosure.
   */
  searchTermToKey?: WordformToKeySpec;

  /**
   * Contains embedded 'default' implementations that may be needed for
   * closures in the compiled version, annotated with the current version
   * of Developer.
   */
  private defaults: {
    version: string;
    applyCasing?: CasingFunction;
  } = {
    version: KEYMAN_VERSION.VERSION_WITH_TAG
  };

  /**
   * Contains the model-specific definitions specified in the model's source.
   *
   * These definitions may expect `defaults.applyCasing` as a parameter in
   * their final closures.
   */
  private model: {
    applyCasing?: CasingFunction;
    searchTermToKey?: WordformToKeySpec;
  } = {};

  /**
   * Selects the casing and keying implementations for a model and wraps them in
   * compile-time closures.
   *
   * @param modelSource The model's source definition; `languageUsesCasing`,
   *                    `applyCasing` and `searchTermToKey` are read from it.
   */
  constructor(modelSource: LexicalModelSource) {
    // Determine the model's `applyCasing` function / implementation.
    // Note that a model-supplied `applyCasing` is only consulted when
    // `languageUsesCasing` is truthy.
    if(modelSource.languageUsesCasing) {
      this.defaults.applyCasing = defaultApplyCasing;

      if(modelSource.applyCasing) {
        this.model.applyCasing = modelSource.applyCasing;
        let _this = this;

        // Since the defined casing function may expect to take our default implementation
        // as a parameter, we can define the full implementation via closure capture.
        this.applyCasing = function(casing: CasingForm, text: string) {
          return _this.model.applyCasing(casing, text, _this.defaults.applyCasing);
        };
      } else {
        this.applyCasing = this.defaults.applyCasing;
      }
    }

    // START:  if(model type uses keying)...

    // Use the default search term to key function, if left unspecified.
    if(modelSource.searchTermToKey) {
      this.model.searchTermToKey = modelSource.searchTermToKey;
    } else if(modelSource.languageUsesCasing) {
      // applyCasing is defined here.
      // Unfortunately, this only works conceptually.  .toString on a closure
      // does not result in proper compilation.
      this.model.searchTermToKey = defaultCasedSearchTermToKey;
    } else if(modelSource.languageUsesCasing == false) {
      // Loose equality: `undefined == false` is false, so an unset
      // `languageUsesCasing` falls through to the final branch below.
      this.model.searchTermToKey = defaultSearchTermToKey;
    } else {
      // If languageUsesCasing is not defined, then we use pre-14.0 behavior,
      // which expects a lowercased default.
      this.model.searchTermToKey = defaultCasedSearchTermToKey;
      // Needed to provide pre-14.0 default lowercasing as part of the
      // search-term keying operation.
      this.defaults.applyCasing = defaultApplyCasing;
      // For compile-time use.
      this.applyCasing = this.defaults.applyCasing;
    }

    // The wrapper reads `_this.applyCasing` at call time and forwards it as the
    // second argument of the selected keying function.
    let _this = this;
    this.searchTermToKey = function(text: string) {
      return _this.model.searchTermToKey(text, _this.applyCasing);
    }

    // END:  if(model type uses keying)...
  }

  // ------------ end:  common compile-time / run-time code ---------------

  // START:  handwritten compilation code (to accomplish the 'common' pattern defined above)

  /**
   * Writes out a compiled JS version of the pseudoclosure, preserving all function
   * implementations.
   *
   * This should be written to the file within the same IIFE as the model but BEFORE
   * the model itself, as the model will need to refer to the definitions herein.
   *
   * The emitted text declares `var <COMPILED_NAME> = { defaults, model, [applyCasing,]
   * searchTermToKey };` where every function is serialized with `.toString()`.
   *
   * NOTE(review): whitespace inside the template literals below is reproduced as it
   * appears in the reviewed copy, whose indentation was not preserved - confirm against
   * the repository copy before applying.
   *
   * @returns The JS source text declaring the pseudoclosure object.
   */
  compileDefinitions(): string {
    let defn: string = '';
    defn += `var ${PSEUDOCLOSURE} = {\n`

    // ----------------------
    // START - the 'defaults', which are common within the same Developer version.
    defn += ` defaults: {\n version: "${this.defaults.version}"`;

    // Only write out `applyCasing` if and when it is needed.
    if(this.defaults.applyCasing) {
      defn += `,\n applyCasing: ${this.defaults.applyCasing.toString()}`;
    }

    // Finalizes `defaults`
    defn += `\n },`;
    // END - the 'defaults'

    // ----------------------
    // START - model-specific definitions (when defined)
    defn += ` model: {\n`;
    defn += ` searchTermToKey: ${this.model.searchTermToKey.toString()}`;

    if(this.model.applyCasing) {
      defn += `,\n applyCasing: ${this.model.applyCasing.toString()}`;
    }
    defn += `\n }`
    // END - model-specific definitions

    // ----------------------
    // START - compiled closures.  Given those definitions, write out the
    //         pseudoclosure-referencing closures for the needed methods.

    // We should be able to define these closures in-line with the object's
    // initialization.  Worst-case, we simply move the definitions outside
    // of the pseudoclosure's init and THEN define/assign these closures to
    // the object, as references will be available then for sure.
    if(this.model.applyCasing) {
      // A major potential issue:  if the user wants to call extra custom functions that they've written.
      //
      // `applyCasing` recursion SHOULD be fine if they write `this.applyCasing()` and forward all arguments
      // appropriately, as it will be known as `applyCasing` on the runtime `this` (`model`) object.
      //
      // Similarly, as long as any helper functions are similarly compiled and stored as part of `model`,
      // they should be accessible too.  The issue would be to actually allow use of extra custom funcs
      // and include them as part of this object as part of compilation.
      defn += `,\n applyCasing: function(caseToApply, text) {
return ${PSEUDOCLOSURE}.model.applyCasing(caseToApply, text, ${PSEUDOCLOSURE}.defaults.applyCasing);
}`;
    } else if(this.defaults.applyCasing) {
      // We can't directly assign from `.defaults`, as initialization-time field reads
      // are not permitted within JS.  Function references, however, are valid.
      defn += `,\n applyCasing: function(caseToApply, text) {
return ${PSEUDOCLOSURE}.defaults.applyCasing(caseToApply, text);
}`;
    }

    // The `searchTermToKey` closure is always emitted; the guard below is
    // intentionally left commented out.
    // if(this.searchTermToKey) {
    defn += `,\n searchTermToKey: function(text) {
return ${PSEUDOCLOSURE}.model.searchTermToKey(text, ${PSEUDOCLOSURE}.applyCasing);
}`;
    // }

    // END - compiled closures.

    // ----------------------
    // Finalize the definition of... `definitions`.
    defn += `\n};\n`;

    return defn;
  }

  /**
   * Compiles the model-options entry for `searchTermToKey` in reference to the
   * compiled pseudoclosure.
   *
   * @returns A JS expression referencing the pseudoclosure's `searchTermToKey`.
   */
  compileSearchTermToKey(): string {
    // Simply point the model to the constructed closure defined by `compileDefinitions`.
    // See "START - compiled closures" section.
    return `${PSEUDOCLOSURE}.searchTermToKey`;
  }

  /**
   * Compiles the model-options entry for `applyCasing` in reference to the
   * compiled pseudoclosure.
   *
   * The reference is returned unconditionally, whereas `compileDefinitions` only
   * emits an `applyCasing` closure when a model or default casing function is set.
   *
   * @returns A JS expression referencing the pseudoclosure's `applyCasing`.
   */
  compileApplyCasing(): string {
    // Simply point the model to the constructed closure defined by `compileDefinitions`.
    // See "START - compiled closures" section.
    return `${PSEUDOCLOSURE}.applyCasing`;
  }
}
|
|
220
|
+
|
|
221
|
+
// Name of the variable under which the compiled pseudoclosure is emitted.
// Because it references the class field, this line must come afterward.
const PSEUDOCLOSURE = ModelDefinitions.COMPILED_NAME;
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import { OverrideScriptDefaults } from "./lexical-model.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Wraps a word breaker so that adjacent spans it produces are merged whenever both
 * contain a character matched by `HAS_SOUTHEAST_ASIAN_LETTER` and the spans are
 * contiguous (the former's `end` equals the latter's `start`).
 *
 * @param breaker The word-breaking function to decorate.
 * @param option  The script override to apply; only 'break-words-at-spaces' is supported.
 * @returns A word-breaking function that post-processes `breaker`'s spans. The array
 *          returned by `breaker` is never modified.
 * @throws Error when `option` is anything other than 'break-words-at-spaces'.
 */
export function decorateWithScriptOverrides(breaker: WordBreakingFunction, option: OverrideScriptDefaults) {
  if (option !== 'break-words-at-spaces') {
    throw new Error(`Unsupported script override: ${option}`)
  }

  /**
   * Matches when a span contains a Southeast-Asian letter or mark anywhere.
   * This makes it a candidate for joining.
   *
   * See: tools/create-override-script-regexp.ts for how this RegExp was
   * generated.
   *
   * Last updated for Unicode 13.0.0.
   *
   * NOTE(review): the class also covers U+30A1..U+30FA and U+30FC..U+30FF -
   * presumably intentional output of the generator tool; confirm.
   */
  const HAS_SOUTHEAST_ASIAN_LETTER = /[\u0E01-\u0E3A\u0E40-\u0E4E\u0E81\u0E82\u0E84\u0E86-\u0E8A\u0E8C-\u0EA3\u0EA5\u0EA7-\u0EBD\u0EC0-\u0EC4\u0EC6\u0EC8-\u0ECD\u0EDC-\u0EDF\u1000-\u103F\u1050-\u108F\u109A-\u109D\u1780-\u17D3\u17D7\u17DC\u17DD\u30A1-\u30FA\u30FC-\u30FF]/;

  return function enhancedBreaker(phrase: string): Span[] {
    let originalSpans = breaker(phrase);

    if (originalSpans.length === 0) {
      return [];
    }

    // Seed the output with the first span by index rather than `shift()`:
    // `shift()` would remove the element from the breaker's own array, which
    // corrupts the result if the breaker reuses or caches that array.
    let outputSpans = [originalSpans[0]];
    for (let i = 1; i < originalSpans.length; i++) {
      let currentSpan = originalSpans[i];
      let previousSpan = lastFrom(outputSpans);

      if (spansAreBackToBack(previousSpan, currentSpan) &&
          hasSouthEastAsianLetter(previousSpan) &&
          hasSouthEastAsianLetter(currentSpan)
      ) {
        // previous span SHOULD be joined with current!
        outputSpans[outputSpans.length - 1] = concatenateSpans(previousSpan, currentSpan);
      } else {
        outputSpans.push(currentSpan);
      }
    }

    return outputSpans;
  }

  /**
   * Returns true when the span's text contains at least one character matched
   * by `HAS_SOUTHEAST_ASIAN_LETTER`.
   */
  function hasSouthEastAsianLetter(span: Span) {
    return HAS_SOUTHEAST_ASIAN_LETTER.test(span.text);
  }

  /**
   * Returns true when the spans are contiguous.
   * Order matters when calling this function!
   */
  function spansAreBackToBack(former: Span, latter: Span): boolean {
    return former.end === latter.start;
  }

  /**
   * Builds a new span covering `former` followed by `latter`.
   * Throws when the two spans are not contiguous.
   */
  function concatenateSpans(former: Span, latter: Span) {
    if (latter.start !== former.end) {
      throw new Error(`Cannot concatenate non-contiguous spans: ${JSON.stringify(former)}/${JSON.stringify(latter)}`);
    }

    return {
      start: former.start,
      end: latter.end,
      length: former.length + latter.length,
      text: former.text + latter.text
    };
  }

  /**
   * Get the last element from the array.
   */
  function lastFrom<T>(array: T[]): T | undefined {
    return array[array.length - 1];
  }
}
|
package/test/README.md
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
{
|
|
2
|
+
"system": {
|
|
3
|
+
"keymanDeveloperVersion": "12.0.1500.0",
|
|
4
|
+
"fileVersion": "12.0"
|
|
5
|
+
},
|
|
6
|
+
"options": {},
|
|
7
|
+
"info": {
|
|
8
|
+
"author": {
|
|
9
|
+
"description": "Eddie Antonio Santos",
|
|
10
|
+
"url": "mailto:Eddie.Santos@nrc-cnrc.gc.ca"
|
|
11
|
+
},
|
|
12
|
+
"copyright": {
|
|
13
|
+
"description": "© 2019 National Research Council Canada"
|
|
14
|
+
},
|
|
15
|
+
"name": {
|
|
16
|
+
"description": "SENĆOŦEN (Saanich Dialect) Lexical Model"
|
|
17
|
+
},
|
|
18
|
+
"version": {
|
|
19
|
+
"description": "1.0.3"
|
|
20
|
+
}
|
|
21
|
+
},
|
|
22
|
+
"files": [
|
|
23
|
+
{
|
|
24
|
+
"name": "example.qaa.sencoten.model.js",
|
|
25
|
+
"description": "Lexical model example.qaa.sencoten.model.js",
|
|
26
|
+
"copyLocation": 0
|
|
27
|
+
}
|
|
28
|
+
],
|
|
29
|
+
"lexicalModels": [
|
|
30
|
+
{
|
|
31
|
+
"name": "SENĆOŦEN dictionary",
|
|
32
|
+
"id": "example.qaa.sencoten",
|
|
33
|
+
"languages": [
|
|
34
|
+
{
|
|
35
|
+
"name": "North Straits Salish",
|
|
36
|
+
"id": "qaa"
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
"name": "SENĆOŦEN",
|
|
40
|
+
"id": "qaa-Latn"
|
|
41
|
+
}
|
|
42
|
+
]
|
|
43
|
+
}
|
|
44
|
+
]
|
|
45
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="utf-8"?>
|
|
2
|
+
<Package>
|
|
3
|
+
<System>
|
|
4
|
+
<KeymanDeveloperVersion>12.0.1500.0</KeymanDeveloperVersion>
|
|
5
|
+
<FileVersion>12.0</FileVersion>
|
|
6
|
+
</System>
|
|
7
|
+
<Options>
|
|
8
|
+
<FollowKeyboardVersion/>
|
|
9
|
+
</Options>
|
|
10
|
+
<Info>
|
|
11
|
+
<Name URL="">SENĆOŦEN (Saanich Dialect) Lexical Model</Name>
|
|
12
|
+
<Copyright URL="">© 2019 National Research Council Canada</Copyright>
|
|
13
|
+
<Author URL="mailto:Eddie.Santos@nrc-cnrc.gc.ca">Eddie Antonio Santos</Author>
|
|
14
|
+
<Version>1.0.3</Version>
|
|
15
|
+
</Info>
|
|
16
|
+
<Files>
|
|
17
|
+
<File>
|
|
18
|
+
<Name>..\build\example.qaa.sencoten.model.js</Name>
|
|
19
|
+
<Description>Lexical model example.qaa.sencoten.model.js</Description>
|
|
20
|
+
<CopyLocation>0</CopyLocation>
|
|
21
|
+
<FileType>.model.js</FileType>
|
|
22
|
+
</File>
|
|
23
|
+
</Files>
|
|
24
|
+
<LexicalModels>
|
|
25
|
+
<LexicalModel>
|
|
26
|
+
<Name>SENĆOŦEN dictionary</Name>
|
|
27
|
+
<ID>example.qaa.sencoten</ID>
|
|
28
|
+
<Version>1.0.3</Version>
|
|
29
|
+
<Languages>
|
|
30
|
+
<Language ID="qaa">North Straits Salish</Language>
|
|
31
|
+
<Language ID="qaa-Latn">SENĆOŦEN</Language>
|
|
32
|
+
</Languages>
|
|
33
|
+
</LexicalModel>
|
|
34
|
+
</LexicalModels>
|
|
35
|
+
</Package>
|
|
Binary file
|
|
Binary file
|