@keymanapp/kmc-model 17.0.85-alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.nyc_output/coverage-10524-1681239236645-0.json +1 -0
- package/Makefile +38 -0
- package/build/cjs-src/lexical-model-compiler.cjs +152688 -0
- package/build/src/build-trie.d.ts +40 -0
- package/build/src/build-trie.d.ts.map +1 -0
- package/build/src/build-trie.js +362 -0
- package/build/src/build-trie.js.map +1 -0
- package/build/src/join-word-breaker-decorator.d.ts +10 -0
- package/build/src/join-word-breaker-decorator.d.ts.map +1 -0
- package/build/src/join-word-breaker-decorator.js +121 -0
- package/build/src/join-word-breaker-decorator.js.map +1 -0
- package/build/src/lexical-model-compiler.d.ts +19 -0
- package/build/src/lexical-model-compiler.d.ts.map +1 -0
- package/build/src/lexical-model-compiler.js +155 -0
- package/build/src/lexical-model-compiler.js.map +1 -0
- package/build/src/lexical-model.d.ts +135 -0
- package/build/src/lexical-model.d.ts.map +1 -0
- package/build/src/lexical-model.js +6 -0
- package/build/src/lexical-model.js.map +1 -0
- package/build/src/main.d.ts +15 -0
- package/build/src/main.d.ts.map +1 -0
- package/build/src/main.js +46 -0
- package/build/src/main.js.map +1 -0
- package/build/src/model-compiler-errors.d.ts +77 -0
- package/build/src/model-compiler-errors.d.ts.map +1 -0
- package/build/src/model-compiler-errors.js +156 -0
- package/build/src/model-compiler-errors.js.map +1 -0
- package/build/src/model-defaults.d.ts +56 -0
- package/build/src/model-defaults.d.ts.map +1 -0
- package/build/src/model-defaults.js +106 -0
- package/build/src/model-defaults.js.map +1 -0
- package/build/src/model-definitions.d.ts +71 -0
- package/build/src/model-definitions.d.ts.map +1 -0
- package/build/src/model-definitions.js +189 -0
- package/build/src/model-definitions.js.map +1 -0
- package/build/src/script-overrides-decorator.d.ts +4 -0
- package/build/src/script-overrides-decorator.d.ts.map +1 -0
- package/build/src/script-overrides-decorator.js +63 -0
- package/build/src/script-overrides-decorator.js.map +1 -0
- package/build/test/helpers/index.d.ts +69 -0
- package/build/test/helpers/index.d.ts.map +1 -0
- package/build/test/helpers/index.js +160 -0
- package/build/test/helpers/index.js.map +1 -0
- package/build/test/test-compile-model-with-pseudoclosure.d.ts +2 -0
- package/build/test/test-compile-model-with-pseudoclosure.d.ts.map +1 -0
- package/build/test/test-compile-model-with-pseudoclosure.js +200 -0
- package/build/test/test-compile-model-with-pseudoclosure.js.map +1 -0
- package/build/test/test-compile-model.d.ts +2 -0
- package/build/test/test-compile-model.d.ts.map +1 -0
- package/build/test/test-compile-model.js +30 -0
- package/build/test/test-compile-model.js.map +1 -0
- package/build/test/test-compile-trie.d.ts +2 -0
- package/build/test/test-compile-trie.d.ts.map +1 -0
- package/build/test/test-compile-trie.js +125 -0
- package/build/test/test-compile-trie.js.map +1 -0
- package/build/test/test-default-apply-case.d.ts +2 -0
- package/build/test/test-default-apply-case.d.ts.map +1 -0
- package/build/test/test-default-apply-case.js +105 -0
- package/build/test/test-default-apply-case.js.map +1 -0
- package/build/test/test-default-search-term-to-key.d.ts +2 -0
- package/build/test/test-default-search-term-to-key.d.ts.map +1 -0
- package/build/test/test-default-search-term-to-key.js +148 -0
- package/build/test/test-default-search-term-to-key.js.map +1 -0
- package/build/test/test-error-logger.d.ts +2 -0
- package/build/test/test-error-logger.d.ts.map +1 -0
- package/build/test/test-error-logger.js +26 -0
- package/build/test/test-error-logger.js.map +1 -0
- package/build/test/test-join-word-breaker.d.ts +2 -0
- package/build/test/test-join-word-breaker.d.ts.map +1 -0
- package/build/test/test-join-word-breaker.js +84 -0
- package/build/test/test-join-word-breaker.js.map +1 -0
- package/build/test/test-model-definitions.d.ts +2 -0
- package/build/test/test-model-definitions.d.ts.map +1 -0
- package/build/test/test-model-definitions.js +165 -0
- package/build/test/test-model-definitions.js.map +1 -0
- package/build/test/test-override-script-defaults.d.ts +2 -0
- package/build/test/test-override-script-defaults.d.ts.map +1 -0
- package/build/test/test-override-script-defaults.js +28 -0
- package/build/test/test-override-script-defaults.js.map +1 -0
- package/build/test/test-parse-wordlist.d.ts +2 -0
- package/build/test/test-parse-wordlist.d.ts.map +1 -0
- package/build/test/test-parse-wordlist.js +110 -0
- package/build/test/test-parse-wordlist.js.map +1 -0
- package/build/test/test-punctuation.d.ts +2 -0
- package/build/test/test-punctuation.d.ts.map +1 -0
- package/build/test/test-punctuation.js +31 -0
- package/build/test/test-punctuation.js.map +1 -0
- package/build/test/tsconfig.tsbuildinfo +1 -0
- package/build/test/wordbreakers/data.d.ts +35 -0
- package/build/test/wordbreakers/data.d.ts.map +1 -0
- package/build/test/wordbreakers/data.js +1778 -0
- package/build/test/wordbreakers/data.js.map +1 -0
- package/build/test/wordbreakers/default-wordbreaker-esm.d.ts +10 -0
- package/build/test/wordbreakers/default-wordbreaker-esm.d.ts.map +1 -0
- package/build/test/wordbreakers/default-wordbreaker-esm.js +354 -0
- package/build/test/wordbreakers/default-wordbreaker-esm.js.map +1 -0
- package/build/tsconfig.tsbuildinfo +1 -0
- package/build.sh +73 -0
- package/coverage/lcov-report/base.css +224 -0
- package/coverage/lcov-report/block-navigation.js +87 -0
- package/coverage/lcov-report/favicon.png +0 -0
- package/coverage/lcov-report/index.html +161 -0
- package/coverage/lcov-report/prettify.css +1 -0
- package/coverage/lcov-report/prettify.js +2 -0
- package/coverage/lcov-report/sort-arrow-sprite.png +0 -0
- package/coverage/lcov-report/sorter.js +196 -0
- package/coverage/lcov-report/src/build-trie.ts.html +1618 -0
- package/coverage/lcov-report/src/index.html +221 -0
- package/coverage/lcov-report/src/join-word-breaker-decorator.ts.html +487 -0
- package/coverage/lcov-report/src/lexical-model-compiler.ts.html +622 -0
- package/coverage/lcov-report/src/main.ts.html +271 -0
- package/coverage/lcov-report/src/model-compiler-errors.ts.html +691 -0
- package/coverage/lcov-report/src/model-defaults.ts.html +415 -0
- package/coverage/lcov-report/src/model-definitions.ts.html +748 -0
- package/coverage/lcov-report/src/script-overrides-decorator.ts.html +310 -0
- package/coverage/lcov-report/test/helpers/index.html +116 -0
- package/coverage/lcov-report/test/helpers/index.ts.html +646 -0
- package/coverage/lcov-report/test/index.html +266 -0
- package/coverage/lcov-report/test/test-compile-model-with-pseudoclosure.ts.html +802 -0
- package/coverage/lcov-report/test/test-compile-model.ts.html +187 -0
- package/coverage/lcov-report/test/test-compile-trie.ts.html +541 -0
- package/coverage/lcov-report/test/test-default-apply-case.ts.html +466 -0
- package/coverage/lcov-report/test/test-default-search-term-to-key.ts.html +628 -0
- package/coverage/lcov-report/test/test-error-logger.ts.html +196 -0
- package/coverage/lcov-report/test/test-join-word-breaker.ts.html +376 -0
- package/coverage/lcov-report/test/test-model-definitions.ts.html +676 -0
- package/coverage/lcov-report/test/test-override-script-defaults.ts.html +184 -0
- package/coverage/lcov-report/test/test-parse-wordlist.ts.html +466 -0
- package/coverage/lcov-report/test/test-punctuation.ts.html +190 -0
- package/coverage/lcov-report/test/wordbreakers/data.ts.html +5413 -0
- package/coverage/lcov-report/test/wordbreakers/default-wordbreaker-esm.ts.html +1234 -0
- package/coverage/lcov-report/test/wordbreakers/index.html +131 -0
- package/coverage/lcov.info +5969 -0
- package/package.json +61 -0
- package/src/build-trie.ts +511 -0
- package/src/join-word-breaker-decorator.ts +134 -0
- package/src/lexical-model-compiler.ts +179 -0
- package/src/lexical-model.ts +150 -0
- package/src/main.ts +62 -0
- package/src/model-compiler-errors.ts +203 -0
- package/src/model-defaults.ts +111 -0
- package/src/model-definitions.ts +222 -0
- package/src/script-overrides-decorator.ts +75 -0
- package/test/README.md +15 -0
- package/test/fixtures/example.qaa.joinwordbreaker/example.qaa.joinwordbreaker.model.ts +10 -0
- package/test/fixtures/example.qaa.joinwordbreaker/wordlist.tsv +3 -0
- package/test/fixtures/example.qaa.scriptusesspaces/example.qaa.scriptusesspaces.model.ts +10 -0
- package/test/fixtures/example.qaa.scriptusesspaces/wordlist.tsv +8 -0
- package/test/fixtures/example.qaa.sencoten/example.qaa.sencoten.model.kmp.json +45 -0
- package/test/fixtures/example.qaa.sencoten/example.qaa.sencoten.model.kps +35 -0
- package/test/fixtures/example.qaa.sencoten/example.qaa.sencoten.model.ts +6 -0
- package/test/fixtures/example.qaa.sencoten/wordlist.tsv +10 -0
- package/test/fixtures/example.qaa.smp/example.qaa.smp.model.ts +6 -0
- package/test/fixtures/example.qaa.smp/wordlist.tsv +5 -0
- package/test/fixtures/example.qaa.trivial/example.qaa.trivial.model.ts +5 -0
- package/test/fixtures/example.qaa.trivial/wordlist.tsv +3 -0
- package/test/fixtures/example.qaa.utf16be/example.qaa.utf16be.model.ts +5 -0
- package/test/fixtures/example.qaa.utf16be/wordlist.txt +0 -0
- package/test/fixtures/example.qaa.utf16le/example.qaa.utf16le.model.ts +5 -0
- package/test/fixtures/example.qaa.utf16le/wordlist.txt +0 -0
- package/test/fixtures/example.qaa.wordbreaker/example.qaa.wordbreaker.model.ts +9 -0
- package/test/fixtures/example.qaa.wordbreaker/wordlist.tsv +3 -0
- package/test/helpers/index.ts +187 -0
- package/test/test-compile-model-with-pseudoclosure.ts +239 -0
- package/test/test-compile-model.ts +34 -0
- package/test/test-compile-trie.ts +152 -0
- package/test/test-default-apply-case.ts +128 -0
- package/test/test-default-search-term-to-key.ts +181 -0
- package/test/test-error-logger.ts +38 -0
- package/test/test-join-word-breaker.ts +97 -0
- package/test/test-model-definitions.ts +198 -0
- package/test/test-override-script-defaults.ts +33 -0
- package/test/test-parse-wordlist.ts +127 -0
- package/test/test-punctuation.ts +35 -0
- package/test/tsconfig.json +22 -0
- package/test/wordbreakers/README.md +3 -0
- package/test/wordbreakers/data.ts +1776 -0
- package/test/wordbreakers/default-wordbreaker-esm.ts +383 -0
- package/tools/create-override-script-regexp.ts +145 -0
- package/tsconfig.json +17 -0
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
|
|
2
|
+
import 'mocha';
|
|
3
|
+
import { assert } from 'chai';
|
|
4
|
+
import { defaultApplyCasing } from '../src/model-defaults.js';
|
|
5
|
+
|
|
6
|
+
describe('The default applyCasing() function', function () {
|
|
7
|
+
// // --------
|
|
8
|
+
// // Definitions toward SMP testing.
|
|
9
|
+
// // Ref: https://unicode.org/charts/nameslist/n_1D400.html
|
|
10
|
+
// // Useful for tests related to strings with supplementary pairs.
|
|
11
|
+
// let u = function(code: number): string {
|
|
12
|
+
// var H = Math.floor((code - 0x10000) / 0x400) + 0xD800;
|
|
13
|
+
// var L = (code - 0x10000) % 0x400 + 0xDC00;
|
|
14
|
+
|
|
15
|
+
// return String.fromCharCode(H, L);
|
|
16
|
+
// }
|
|
17
|
+
|
|
18
|
+
// Are there any known default-handled SMP cases?
|
|
19
|
+
// If not... we could make the defaultApplyCasing function simpler by not worrying about SMP.
|
|
20
|
+
//
|
|
21
|
+
// let smp_a = u(0x1d5ba); // MATHEMATICAL SANS-SERIF SMALL A
|
|
22
|
+
// let smp_p = u(0x1d5c9);
|
|
23
|
+
// let smp_l = u(0x1d5c5);
|
|
24
|
+
// let smp_e = u(0x1d5be);
|
|
25
|
+
|
|
26
|
+
// let smp_A = u(0x1d5a0); // MATHEMATICAL SANS-SERIF CAPITAL A
|
|
27
|
+
// let smp_P = u(0x1d5af);
|
|
28
|
+
// let smp_L = u(0x1d5ab);
|
|
29
|
+
// let smp_E = u(0x1d5a4);
|
|
30
|
+
|
|
31
|
+
// // Unfortunately... the default JS .toUpperCase() implementation doesn't actually
|
|
32
|
+
// // map the 'SMALL' versions to the 'CAPITAL' versions.
|
|
33
|
+
// // ---------
|
|
34
|
+
|
|
35
|
+
describe('case: \'lower\'', function() {
|
|
36
|
+
const testCases: [string, string][] = [
|
|
37
|
+
// Note: not written the Turkish way. Turns out 'İ'.toLowerCase() decomposes the result,
|
|
38
|
+
// which would have made for a fairly yucky test.
|
|
39
|
+
['Istanbul', 'istanbul'],
|
|
40
|
+
|
|
41
|
+
// The DEFAULT function is NOT responsible for understanding the Turkish
|
|
42
|
+
// case regarding the lowercasing of:
|
|
43
|
+
// 'I' U+0049 LATIN CAPITAL LETTER I to 'ı' U+0131 LATIN SMALL LETTER DOTLESS I
|
|
44
|
+
// For Turkic languages, the recommendation is to make a
|
|
45
|
+
// custom applyCasing function:
|
|
46
|
+
['DİYARBAKIR', 'di̇yarbakir'], // The 'i̇' is the decomposed result alluded to for the previous case.
|
|
47
|
+
|
|
48
|
+
// full-width romaji has corresponding capitalized versions:
|
|
49
|
+
['AESTHETIC', 'aesthetic'],
|
|
50
|
+
|
|
51
|
+
// "skýlos" is Greek for dog 🇬🇷🐶
|
|
52
|
+
// starts with an 's' and ends with an 's'
|
|
53
|
+
// which are DIFFERENT CHARACTERS in lowercased Greek!
|
|
54
|
+
['ΣΚΥΛΟΣ', 'σκυλος'],
|
|
55
|
+
|
|
56
|
+
// Uncased syntax and numbers should pass through unscathed:
|
|
57
|
+
['1234.?!', '1234.?!']
|
|
58
|
+
];
|
|
59
|
+
|
|
60
|
+
for (let [input, expected] of testCases) {
|
|
61
|
+
it(`should lowercase '${input}' as '${expected}'`, function() {
|
|
62
|
+
assert.equal(defaultApplyCasing('lower', input), expected);
|
|
63
|
+
});
|
|
64
|
+
}
|
|
65
|
+
});
|
|
66
|
+
|
|
67
|
+
describe('case: \'upper\'', function() {
|
|
68
|
+
const testCases: [string, string][] = [
|
|
69
|
+
// Inverse of the corresponding 'lower' test.
|
|
70
|
+
['istanbul', 'ISTANBUL'],
|
|
71
|
+
|
|
72
|
+
// The DEFAULT function is NOT responsible for understanding the Turkish
|
|
73
|
+
// case regarding the uppercasing of:
|
|
74
|
+
// 'ı' U+0131 LATIN SMALL LETTER DOTLESS I to 'I' U+0049 LATIN CAPITAL LETTER I
|
|
75
|
+
// For Turkic languages, the recommendation is to make a
|
|
76
|
+
// custom applyCasing function:
|
|
77
|
+
['diyarbakır', 'DIYARBAKIR'], // The 'i̇' is the decomposed result alluded to for the previous case.
|
|
78
|
+
|
|
79
|
+
// full-width romaji has corresponding capitalized versions:
|
|
80
|
+
['aesthetic', 'AESTHETIC'],
|
|
81
|
+
|
|
82
|
+
// "skýlos" is Greek for dog 🇬🇷🐶
|
|
83
|
+
// starts with an 's' and ends with an 's'
|
|
84
|
+
// which are DIFFERENT CHARACTERS in lowercased Greek!
|
|
85
|
+
['σκυλος', 'ΣΚΥΛΟΣ'],
|
|
86
|
+
|
|
87
|
+
// Uncased syntax and numbers should pass through unscathed:
|
|
88
|
+
['1234.?!', '1234.?!']
|
|
89
|
+
];
|
|
90
|
+
|
|
91
|
+
for (let [input, expected] of testCases) {
|
|
92
|
+
it(`should uppercase '${input}' as '${expected}'`, function() {
|
|
93
|
+
assert.equal(defaultApplyCasing('upper', input), expected);
|
|
94
|
+
});
|
|
95
|
+
}
|
|
96
|
+
});
|
|
97
|
+
|
|
98
|
+
describe('case: \'initial\'', function() {
|
|
99
|
+
const testCases: [string, string][] = [
|
|
100
|
+
// Inverse of the corresponding 'lower' test.
|
|
101
|
+
['istanbul', 'Istanbul'],
|
|
102
|
+
|
|
103
|
+
// The DEFAULT function is NOT responsible for understanding the Turkish
|
|
104
|
+
// case regarding the uppercasing of:
|
|
105
|
+
// 'ı' U+0131 LATIN SMALL LETTER DOTLESS I to 'I' U+0049 LATIN CAPITAL LETTER I
|
|
106
|
+
// For Turkic languages, the recommendation is to make a
|
|
107
|
+
// custom applyCasing function:
|
|
108
|
+
['diyarbakır', 'Diyarbakır'], // The 'i̇' is the decomposed result alluded to for the previous case.
|
|
109
|
+
|
|
110
|
+
// full-width romaji has corresponding capitalized versions:
|
|
111
|
+
['aesthetic', 'Aesthetic'],
|
|
112
|
+
|
|
113
|
+
// "skýlos" is Greek for dog 🇬🇷🐶
|
|
114
|
+
// starts with an 's' and ends with an 's'
|
|
115
|
+
// which are DIFFERENT CHARACTERS in lowercased Greek!
|
|
116
|
+
['σκυλος', 'Σκυλος'],
|
|
117
|
+
|
|
118
|
+
// Uncased syntax and numbers should pass through unscathed:
|
|
119
|
+
['1234.?!', '1234.?!']
|
|
120
|
+
];
|
|
121
|
+
|
|
122
|
+
for (let [input, expected] of testCases) {
|
|
123
|
+
it(`should initial-case '${input}' as '${expected}'`, function() {
|
|
124
|
+
assert.equal(defaultApplyCasing('initial', input), expected);
|
|
125
|
+
});
|
|
126
|
+
}
|
|
127
|
+
});
|
|
128
|
+
});
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
|
|
2
|
+
import 'mocha';
|
|
3
|
+
import {assert} from 'chai';
|
|
4
|
+
import { defaultSearchTermToKey,
|
|
5
|
+
defaultCasedSearchTermToKey,
|
|
6
|
+
defaultApplyCasing } from '../src/model-defaults.js';
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
describe('The default searchTermToKey() function', function () {
|
|
10
|
+
describe('languageUsesCasing: false', function() {
|
|
11
|
+
const testCases: [string, string][] = [
|
|
12
|
+
// "İstanbul" has a U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE.
|
|
13
|
+
// Without any casing operations, only the I should be altered.
|
|
14
|
+
['İstanbul', 'Istanbul'],
|
|
15
|
+
|
|
16
|
+
// Similarly...
|
|
17
|
+
['DİYARBAKIR', 'DIYARBAKIR'],
|
|
18
|
+
|
|
19
|
+
// "skýlos" is Greek for dog 🇬🇷🐶
|
|
20
|
+
// starts with an 's' and ends with an 's'
|
|
21
|
+
// which are DIFFERENT CHARACTERS in lowercased Greek!
|
|
22
|
+
['σκύλος', 'σκυλος'],
|
|
23
|
+
['ΣΚΥΛΟΣ', 'ΣΚΥΛΟΣ'],
|
|
24
|
+
|
|
25
|
+
// full-width romaji is compatibility-canonical with ASCII characters:
|
|
26
|
+
['aesthetic', 'aesthetic'],
|
|
27
|
+
|
|
28
|
+
// U+212B ANGSTROM SIGN (Å)
|
|
29
|
+
// U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE (Å)
|
|
30
|
+
// and should both normalize to 'A'
|
|
31
|
+
['\u212B', 'A'],
|
|
32
|
+
['\u00C5', 'A'],
|
|
33
|
+
|
|
34
|
+
// We should not fall for U+037E GREEK QUESTION MARK's trolling:
|
|
35
|
+
['\u037e', ';'],
|
|
36
|
+
|
|
37
|
+
// Test presentational forms of Arabic:
|
|
38
|
+
// U+FE8D ARABIC LETTER ALEF ISOLATED FORM -> U+0627 ARABIC LETTER ALEF
|
|
39
|
+
// U+FEDF ARABIC LETTER LAM INITIAL FORM -> U+0644 ARABIC LETTER LAM
|
|
40
|
+
// U+FED8 ARABIC LETTER QAF MEDIAL FORM -> U+0642 ARABIC LETTER QAF
|
|
41
|
+
// U+FEEC ARABIC LETTER HEH MEDIAL FORM -> U+0647 ARABIC LETTER HEH
|
|
42
|
+
// U+FEEE ARABIC LETTER WAW FINAL FORM -> U+0648 ARABIC LETTER WAW
|
|
43
|
+
// U+FE93 ARABIC LETTER TEH MARBUTA ISOLATED FORM -> U+0629 ARABIC LETTER TEH MARBUTA
|
|
44
|
+
['\uFE8D\uFEDF\uFED8\uFEEC\uFEEE\uFE93', '\u0627\u0644\u0642\u0647\u0648\u0629'],
|
|
45
|
+
|
|
46
|
+
// Combine both NFKD **AND** knocking off diacritics:
|
|
47
|
+
// U+01C4 LATIN CAPITAL LETTER DZ WITH CARON (DŽ) -> <U+0044, U+005A> (DZ)
|
|
48
|
+
['DŽ', 'DZ'],
|
|
49
|
+
];
|
|
50
|
+
|
|
51
|
+
for (let [input, expected] of testCases) {
|
|
52
|
+
it(`should normalize '${input}' to '${expected}'`, function() {
|
|
53
|
+
assert.equal(defaultSearchTermToKey(input), expected);
|
|
54
|
+
});
|
|
55
|
+
}
|
|
56
|
+
});
|
|
57
|
+
|
|
58
|
+
describe('languageUsesCasing: true (custom applyCasing, inverts lower- & upper- casing)', function() {
|
|
59
|
+
const testCases: [string, string][] = [
|
|
60
|
+
// The DEFAULT function is NOT responsible for understanding the Turkish
|
|
61
|
+
// distinction between U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and
|
|
62
|
+
// 'I' U+0049 LATIN CAPITAL LETTER I.
|
|
63
|
+
// For Turkic languages, the recommendation is thus to make a
|
|
64
|
+
// custom searchTermToKey function.
|
|
65
|
+
['İstanbul', 'ISTANBUL'],
|
|
66
|
+
['DİYARBAKIR', 'DIYARBAKIR'],
|
|
67
|
+
|
|
68
|
+
// "skýlos" is Greek for dog 🇬🇷🐶
|
|
69
|
+
// starts with an 's' and ends with an 's'
|
|
70
|
+
// which are DIFFERENT CHARACTERS in lowercased Greek!
|
|
71
|
+
['σκύλος', 'ΣΚΥΛΟΣ'],
|
|
72
|
+
['σκυλοσ', 'ΣΚΥΛΟΣ'],
|
|
73
|
+
|
|
74
|
+
// full-width romaji is compatibility-canonical with ASCII characters:
|
|
75
|
+
['aesthetic', 'AESTHETIC'],
|
|
76
|
+
|
|
77
|
+
// U+212B ANGSTROM SIGN (Å)
|
|
78
|
+
// U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE (Å)
|
|
79
|
+
// and should both normalize to 'A' (this test's custom casing inverts lower- & upper- casing)
|
|
80
|
+
['\u212B', 'A'],
|
|
81
|
+
['\u00C5', 'A'],
|
|
82
|
+
|
|
83
|
+
// We should not fall for U+037E GREEK QUESTION MARK's trolling:
|
|
84
|
+
['\u037e', ';'],
|
|
85
|
+
|
|
86
|
+
// Test presentational forms of Arabic:
|
|
87
|
+
// U+FE8D ARABIC LETTER ALEF ISOLATED FORM -> U+0627 ARABIC LETTER ALEF
|
|
88
|
+
// U+FEDF ARABIC LETTER LAM INITIAL FORM -> U+0644 ARABIC LETTER LAM
|
|
89
|
+
// U+FED8 ARABIC LETTER QAF MEDIAL FORM -> U+0642 ARABIC LETTER QAF
|
|
90
|
+
// U+FEEC ARABIC LETTER HEH MEDIAL FORM -> U+0647 ARABIC LETTER HEH
|
|
91
|
+
// U+FEEE ARABIC LETTER WAW FINAL FORM -> U+0648 ARABIC LETTER WAW
|
|
92
|
+
// U+FE93 ARABIC LETTER TEH MARBUTA ISOLATED FORM -> U+0629 ARABIC LETTER TEH MARBUTA
|
|
93
|
+
['\uFE8D\uFEDF\uFED8\uFEEC\uFEEE\uFE93', '\u0627\u0644\u0642\u0647\u0648\u0629'],
|
|
94
|
+
|
|
95
|
+
// Combine both NFKD **AND** knocking off diacritics:
|
|
96
|
+
// U+01C4 LATIN CAPITAL LETTER DZ WITH CARON (DŽ) -> <U+0044, U+005A> (DZ)
|
|
97
|
+
['DŽ', 'DZ'],
|
|
98
|
+
];
|
|
99
|
+
|
|
100
|
+
// While a Turkish-based test would be nice, Turkish needs custom keying,
|
|
101
|
+
// as U+0130's default handling is... not ideal in Turkish.
|
|
102
|
+
//
|
|
103
|
+
// Instead, we can get a simple-enough test with inverted casing.
|
|
104
|
+
let customCasing = function(caseToApply: CasingForm,
|
|
105
|
+
text: string,
|
|
106
|
+
defaultApplyCasing: CasingFunction): string {
|
|
107
|
+
switch(caseToApply) {
|
|
108
|
+
case 'lower':
|
|
109
|
+
return text.toUpperCase();
|
|
110
|
+
case 'upper':
|
|
111
|
+
return text.toLowerCase();
|
|
112
|
+
case 'initial':
|
|
113
|
+
return customCasing('upper', text.charAt(0), defaultApplyCasing) + text.substr(1);
|
|
114
|
+
default:
|
|
115
|
+
return text;
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
let customCasingClosure = function(caseToApply: CasingForm, text: string): string {
|
|
120
|
+
return customCasing(caseToApply, text, defaultApplyCasing);
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
for (let [input, expected] of testCases) {
|
|
124
|
+
it(`should normalize '${input}' to '${expected}'`, function() {
|
|
125
|
+
assert.equal(defaultCasedSearchTermToKey(input, customCasingClosure as CasingFunction), expected);
|
|
126
|
+
});
|
|
127
|
+
}
|
|
128
|
+
});
|
|
129
|
+
|
|
130
|
+
describe('languageUsesCasing: undefined (legacy 12.0 / 13.0 mode, uses `defaultApplyCasing`)', function() {
|
|
131
|
+
const testCases: [string, string][] = [
|
|
132
|
+
// "İstanbul" has a U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE.
|
|
133
|
+
// This should be lowercased.
|
|
134
|
+
['İstanbul', 'istanbul'],
|
|
135
|
+
|
|
136
|
+
// The DEFAULT function is NOT responsible for understanding the Turkish
|
|
137
|
+
// case regarding the lowercasing of:
|
|
138
|
+
// 'I' U+0049 LATIN CAPITAL LETTER I to 'ı' U+0131 LATIN SMALL LETTER DOTLESS I
|
|
139
|
+
// For Turkic languages, the recommendation is to make a
|
|
140
|
+
// custom searchTermToKey function:
|
|
141
|
+
['DİYARBAKIR', 'diyarbakir'],
|
|
142
|
+
|
|
143
|
+
// "skýlos" is Greek for dog 🇬🇷🐶
|
|
144
|
+
// starts with an 's' and ends with an 's'
|
|
145
|
+
// which are DIFFERENT CHARACTERS in lowercased Greek!
|
|
146
|
+
['σκύλος', 'σκυλος'],
|
|
147
|
+
['ΣΚΥΛΟΣ', 'σκυλοσ'],
|
|
148
|
+
|
|
149
|
+
// full-width romaji is compatibility-canonical with ASCII characters:
|
|
150
|
+
['aesthetic', 'aesthetic'],
|
|
151
|
+
|
|
152
|
+
// U+212B ANGSTROM SIGN (Å)
|
|
153
|
+
// U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE (Å)
|
|
154
|
+
// and should both normalize to 'a'
|
|
155
|
+
['\u212B', 'a'],
|
|
156
|
+
['\u00C5', 'a'],
|
|
157
|
+
|
|
158
|
+
// We should not fall for U+037E GREEK QUESTION MARK's trolling:
|
|
159
|
+
['\u037e', ';'],
|
|
160
|
+
|
|
161
|
+
// Test presentational forms of Arabic:
|
|
162
|
+
// U+FE8D ARABIC LETTER ALEF ISOLATED FORM -> U+0627 ARABIC LETTER ALEF
|
|
163
|
+
// U+FEDF ARABIC LETTER LAM INITIAL FORM -> U+0644 ARABIC LETTER LAM
|
|
164
|
+
// U+FED8 ARABIC LETTER QAF MEDIAL FORM -> U+0642 ARABIC LETTER QAF
|
|
165
|
+
// U+FEEC ARABIC LETTER HEH MEDIAL FORM -> U+0647 ARABIC LETTER HEH
|
|
166
|
+
// U+FEEE ARABIC LETTER WAW FINAL FORM -> U+0648 ARABIC LETTER WAW
|
|
167
|
+
// U+FE93 ARABIC LETTER TEH MARBUTA ISOLATED FORM -> U+0629 ARABIC LETTER TEH MARBUTA
|
|
168
|
+
['\uFE8D\uFEDF\uFED8\uFEEC\uFEEE\uFE93', '\u0627\u0644\u0642\u0647\u0648\u0629'],
|
|
169
|
+
|
|
170
|
+
// Combine both NFKD **AND** knocking off diacritics:
|
|
171
|
+
// U+01C4 LATIN CAPITAL LETTER DZ WITH CARON (DŽ) -> <U+0064, U+007A> (dz)
|
|
172
|
+
['DŽ', 'dz'],
|
|
173
|
+
];
|
|
174
|
+
|
|
175
|
+
for (let [input, expected] of testCases) {
|
|
176
|
+
it(`should normalize '${input}' to '${expected}'`, function() {
|
|
177
|
+
assert.equal(defaultCasedSearchTermToKey(input, defaultApplyCasing), expected);
|
|
178
|
+
});
|
|
179
|
+
}
|
|
180
|
+
});
|
|
181
|
+
});
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { MAX_MESSAGES, KeymanCompilerError, log } from "../src/model-compiler-errors.js";
|
|
2
|
+
import { LogHoarder } from "./helpers/index.js";
|
|
3
|
+
import { assert } from "chai";
|
|
4
|
+
|
|
5
|
+
describe('error logger', function () {
|
|
6
|
+
beforeEach(function () {
|
|
7
|
+
this.logHoarder = (new LogHoarder).install()
|
|
8
|
+
})
|
|
9
|
+
|
|
10
|
+
afterEach(function () {
|
|
11
|
+
this.logHoarder.uninstall();
|
|
12
|
+
delete this.logHoarder;
|
|
13
|
+
})
|
|
14
|
+
|
|
15
|
+
it('should stop logging messages **after** a maximum', function () {
|
|
16
|
+
for (let i = 0; i < MAX_MESSAGES; i++) {
|
|
17
|
+
log(KeymanCompilerError.CWARN_DuplicateWordInSameFile, "fake error");
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
// We've logged *just* enough messages. This error should not be found:
|
|
21
|
+
assert.isFalse(this.logHoarder.hasSeenCode(
|
|
22
|
+
KeymanCompilerError.CWARN_TooManyErrorsOrWarnings
|
|
23
|
+
));
|
|
24
|
+
|
|
25
|
+
// Log just one too many:
|
|
26
|
+
log(KeymanCompilerError.CWARN_DuplicateWordInSameFile, "fake error");
|
|
27
|
+
|
|
28
|
+
assert.isTrue(this.logHoarder.hasSeenCode(
|
|
29
|
+
KeymanCompilerError.CWARN_TooManyErrorsOrWarnings
|
|
30
|
+
));
|
|
31
|
+
|
|
32
|
+
// Log a DIFFERENT error -- it should not appear in the log
|
|
33
|
+
log(KeymanCompilerError.CWARN_MixedNormalizationForms, "fake error");
|
|
34
|
+
assert.isFalse(this.logHoarder.hasSeenCode(
|
|
35
|
+
KeymanCompilerError.CWARN_MixedNormalizationForms
|
|
36
|
+
));
|
|
37
|
+
})
|
|
38
|
+
})
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import { assert } from "chai";
|
|
2
|
+
import defaultWordBreaker from './wordbreakers/default-wordbreaker-esm.js';
|
|
3
|
+
import {decorateWithJoin} from '../src/join-word-breaker-decorator.js';
|
|
4
|
+
|
|
5
|
+
describe('The join word breaker decorator', function () {
|
|
6
|
+
it('should decorate an existing word breaker', function () {
|
|
7
|
+
let breakWords = decorateWithJoin(defaultWordBreaker, ['-']);
|
|
8
|
+
assert.isFunction(breakWords);
|
|
9
|
+
});
|
|
10
|
+
|
|
11
|
+
const TEST_CASES: [string, string[], string[], string[]][] = [
|
|
12
|
+
/* input, joiners, default breaks, breaks with joins */
|
|
13
|
+
// Original test case from https://github.com/keymanapp/keyman/issues/2753
|
|
14
|
+
['khui-chhùi', ['-'], ["khui", "-", "chhùi"], ["khui-chhùi"]],
|
|
15
|
+
|
|
16
|
+
// Plains Cree SRO:
|
|
17
|
+
['ê-kotiskâwêyâhk', ['-'], ['ê', '-', 'kotiskâwêyâhk'], ['ê-kotiskâwêyâhk']],
|
|
18
|
+
|
|
19
|
+
// Edge cases:
|
|
20
|
+
|
|
21
|
+
// Joiner alone:
|
|
22
|
+
['-', ['-'], ['-'], ['-']],
|
|
23
|
+
// Joiner at the end:
|
|
24
|
+
['ni-', ['-'], ['ni', '-'], ['ni-']],
|
|
25
|
+
// Joiner at the start:
|
|
26
|
+
['-ân', ['-'], ['-', 'ân'], ['-ân']],
|
|
27
|
+
|
|
28
|
+
// This was my guiding test case:
|
|
29
|
+
[
|
|
30
|
+
"-yâhk ê-nitawi-kotiskâwêyâhk ni-",
|
|
31
|
+
["-"],
|
|
32
|
+
["-", "yâhk", "ê", "-", "nitawi", "-", "kotiskâwêyâhk", "ni", "-"],
|
|
33
|
+
["-yâhk", "ê-nitawi-kotiskâwêyâhk", "ni-"]
|
|
34
|
+
],
|
|
35
|
+
|
|
36
|
+
// Do not perform any joins:
|
|
37
|
+
["hello world", ["-"], ["hello", "world"], ["hello", "world"]],
|
|
38
|
+
|
|
39
|
+
// Joining using multiple joiners
|
|
40
|
+
[
|
|
41
|
+
"Email: no-body@example.com",
|
|
42
|
+
["@", "-"],
|
|
43
|
+
["Email", ":", "no", "-", "body", "@", "example.com"],
|
|
44
|
+
["Email", ":", "no-body@example.com"]
|
|
45
|
+
],
|
|
46
|
+
|
|
47
|
+
// Joining with two or more joiners in a row
|
|
48
|
+
[
|
|
49
|
+
"nobody@@example.com",
|
|
50
|
+
["@"],
|
|
51
|
+
["nobody", "@", "@", "example.com"],
|
|
52
|
+
["nobody@@example.com"]
|
|
53
|
+
],
|
|
54
|
+
|
|
55
|
+
// it should NOT join non-contiguous spans:
|
|
56
|
+
[
|
|
57
|
+
"this- is -bad",
|
|
58
|
+
["-"],
|
|
59
|
+
["this", "-", "is", "-", "bad"],
|
|
60
|
+
["this-", "is", "-bad"]
|
|
61
|
+
],
|
|
62
|
+
|
|
63
|
+
// different but adjacent joiners
|
|
64
|
+
[
|
|
65
|
+
"I made the kawé:-conjugator.",
|
|
66
|
+
["-", ":"],
|
|
67
|
+
["I", "made", "the", "kawé", ":", "-", "conjugator", "."],
|
|
68
|
+
["I", "made", "the", "kawé:-conjugator", "."]
|
|
69
|
+
],
|
|
70
|
+
|
|
71
|
+
// 3+ joiners in a row
|
|
72
|
+
[
|
|
73
|
+
// NB: – is U+2013 EN DASH
|
|
74
|
+
"This language is nut–=💠¤~ty!",
|
|
75
|
+
["~", "–", "¤", "=", "💠"],
|
|
76
|
+
["This", "language", "is", "nut", "–", "=", "💠", "¤", "~", "ty", "!"],
|
|
77
|
+
["This", "language", "is", "nut–=💠¤~ty", "!"],
|
|
78
|
+
],
|
|
79
|
+
]
|
|
80
|
+
|
|
81
|
+
for (let [phrase, joiners, unjoined, expected] of TEST_CASES) {
|
|
82
|
+
it(`should break «${[phrase]}» as [${expected.join(' ;; ')}]`, function () {
|
|
83
|
+
let breakWords = decorateWithJoin(defaultWordBreaker, joiners);
|
|
84
|
+
let unjoinedResult = defaultWordBreaker(phrase).map(onlyText);
|
|
85
|
+
assert.deepEqual(unjoinedResult, unjoined);
|
|
86
|
+
let actualResult = breakWords(phrase).map(onlyText);
|
|
87
|
+
assert.deepEqual(actualResult, expected);
|
|
88
|
+
});
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
/**
|
|
92
|
+
* Get just the text from a span.
|
|
93
|
+
*/
|
|
94
|
+
function onlyText(span: Span) {
|
|
95
|
+
return span.text;
|
|
96
|
+
}
|
|
97
|
+
});
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
import 'mocha';
|
|
2
|
+
import { assert } from 'chai';
|
|
3
|
+
import { ModelDefinitions } from '../src/model-definitions.js';
|
|
4
|
+
import { LexicalModelSource } from '../src/lexical-model.js';
|
|
5
|
+
|
|
6
|
+
describe('Model definition pseudoclosures', function () {
|
|
7
|
+
describe('14.0 defaults', function() {
|
|
8
|
+
describe('languageUsesCasing == true', function() {
|
|
9
|
+
// We don't need a complete spec for this, given the (currently) limited range of what
|
|
10
|
+
// the ModelPseudoclosure covers.
|
|
11
|
+
let modelSource: LexicalModelSource = {
|
|
12
|
+
languageUsesCasing: true,
|
|
13
|
+
sources: [],
|
|
14
|
+
format: 'trie-1.0'
|
|
15
|
+
};
|
|
16
|
+
|
|
17
|
+
let pseudoclosure = new ModelDefinitions(modelSource);
|
|
18
|
+
|
|
19
|
+
const testCases: [string, string, string][] = [
|
|
20
|
+
// Note: not written the Turkish way. Turns out 'İ'.toLowerCase() decomposes the result,
|
|
21
|
+
// which would have made for a fairly yucky test.
|
|
22
|
+
['Istanbul', 'istanbul', 'istanbul'],
|
|
23
|
+
|
|
24
|
+
// The DEFAULT function is NOT responsible for understanding the Turkish
|
|
25
|
+
// case regarding the lowercasing of:
|
|
26
|
+
// 'I' U+0049 LATIN CAPITAL LETTER I to 'ı' U+0131 LATIN SMALL LETTER DOTLESS I
|
|
27
|
+
// For Turkic languages, the recommendation is to make a
|
|
28
|
+
// custom applyCasing function:
|
|
29
|
+
['DİYARBAKIR', 'di̇yarbakir', 'diyarbakir'], // The 'i̇' is the decomposed result alluded to for the previous case.
|
|
30
|
+
|
|
31
|
+
// full-width romaji has corresponding lowercased versions:
|
|
32
|
+
['AESTHETIC', 'aesthetic', 'aesthetic'],
|
|
33
|
+
|
|
34
|
+
// "skýlos" is Greek for dog 🇬🇷🐶
|
|
35
|
+
// starts with an 's' and ends with an 's'
|
|
36
|
+
// which are DIFFERENT CHARACTERS in lowercased Greek!
|
|
37
|
+
['σκύλος', 'σκύλος', 'σκυλος'],
|
|
38
|
+
['ΣΚΥΛΟΣ', 'σκυλος', 'σκυλοσ'], // the keyed version after lowercasing doesn't know how
|
|
39
|
+
// to make the distinction. Both 'Σ's have the same char-code.
|
|
40
|
+
|
|
41
|
+
// Uncased syntax and numbers should pass through unscathed:
|
|
42
|
+
['1234.?!', '1234.?!', '1234.?!'],
|
|
43
|
+
['”', '”', '"'],
|
|
44
|
+
["‘", "‘", "'"]
|
|
45
|
+
];
|
|
46
|
+
|
|
47
|
+
for (let [input, cased, keyed] of testCases) {
|
|
48
|
+
it(`should case '${input}' as '${cased}'`, function() {
|
|
49
|
+
assert.equal(pseudoclosure.applyCasing('lower', input), cased);
|
|
50
|
+
});
|
|
51
|
+
|
|
52
|
+
it(`should key '${input}' as '${keyed}'`, function() {
|
|
53
|
+
assert.equal(pseudoclosure.searchTermToKey(input), keyed);
|
|
54
|
+
});
|
|
55
|
+
}
|
|
56
|
+
});
|
|
57
|
+
|
|
58
|
+
describe('languageUsesCasing == false', function() {
|
|
59
|
+
// We don't need a complete spec for this, given the (currently) limited range of what
|
|
60
|
+
// the ModelPseudoclosure covers.
|
|
61
|
+
let modelSource: LexicalModelSource = {
|
|
62
|
+
languageUsesCasing: false,
|
|
63
|
+
sources: [],
|
|
64
|
+
format: 'trie-1.0'
|
|
65
|
+
};
|
|
66
|
+
|
|
67
|
+
let pseudoclosure = new ModelDefinitions(modelSource);
|
|
68
|
+
|
|
69
|
+
const testCases: [string, string][] = [
|
|
70
|
+
// Note: not written the Turkish way. Turns out 'İ'.toLowerCase() decomposes the result,
|
|
71
|
+
// which would have made for a fairly yucky test.
|
|
72
|
+
['Istanbul', 'Istanbul'],
|
|
73
|
+
|
|
74
|
+
['DİYARBAKIR', 'DIYARBAKIR'],
|
|
75
|
+
|
|
76
|
+
// full-width romaji has corresponding capitalized versions:
|
|
77
|
+
['AESTHETIC', 'AESTHETIC'],
|
|
78
|
+
|
|
79
|
+
// "skýlos" is Greek for dog 🇬🇷🐶
|
|
80
|
+
// starts with an 's' and ends with an 's'
|
|
81
|
+
// which are DIFFERENT CHARACTERS in lowercased Greek!
|
|
82
|
+
['σκύλος', 'σκυλος'],
|
|
83
|
+
['ΣΚΥΛΟΣ', 'ΣΚΥΛΟΣ'], // the keyed version after lowercasing doesn't know how
|
|
84
|
+
// to make the distinction. Both 'Σ's have the same char-code.
|
|
85
|
+
|
|
86
|
+
// Uncased syntax and numbers should pass through unscathed:
|
|
87
|
+
['1234.?!', '1234.?!'],
|
|
88
|
+
|
|
89
|
+
['”', '"'],
|
|
90
|
+
["‘", "'"]
|
|
91
|
+
];
|
|
92
|
+
|
|
93
|
+
for (let [input, keyed] of testCases) {
|
|
94
|
+
it(`should key '${input}' as '${keyed}'`, function() {
|
|
95
|
+
assert.equal(pseudoclosure.searchTermToKey(input), keyed);
|
|
96
|
+
});
|
|
97
|
+
}
|
|
98
|
+
});
|
|
99
|
+
});
|
|
100
|
+
describe('Pre 14.0 defaults (languageUsesCasing == undefined)', function() {
|
|
101
|
+
// We don't need a complete spec for this, given the (currently) limited range of what
|
|
102
|
+
// the ModelPseudoclosure covers.
|
|
103
|
+
let modelSource: LexicalModelSource = {
|
|
104
|
+
sources: [],
|
|
105
|
+
format: 'trie-1.0'
|
|
106
|
+
};
|
|
107
|
+
|
|
108
|
+
let pseudoclosure = new ModelDefinitions(modelSource);
|
|
109
|
+
|
|
110
|
+
const testCases: [string, string][] = [
|
|
111
|
+
// Note: not written the Turkish way. Turns out 'İ'.toLowerCase() decomposes the result,
|
|
112
|
+
// which would have made for a fairly yucky test.
|
|
113
|
+
['Istanbul', 'istanbul'],
|
|
114
|
+
|
|
115
|
+
['DİYARBAKIR', 'diyarbakir'],
|
|
116
|
+
|
|
117
|
+
// full-width romaji has corresponding lowercased versions:
|
|
118
|
+
['AESTHETIC', 'aesthetic'],
|
|
119
|
+
|
|
120
|
+
// "skýlos" is Greek for dog 🇬🇷🐶
|
|
121
|
+
// starts with an 's' and ends with an 's'
|
|
122
|
+
// which are DIFFERENT CHARACTERS in lowercased Greek!
|
|
123
|
+
['σκύλος', 'σκυλος'],
|
|
124
|
+
['ΣΚΥΛΟΣ', 'σκυλοσ'], // the keyed version after lowercasing doesn't know how
|
|
125
|
+
// to make the distinction. Both 'Σ's have the same char-code.
|
|
126
|
+
|
|
127
|
+
// Uncased syntax and numbers should pass through unscathed:
|
|
128
|
+
['1234.?!', '1234.?!']
|
|
129
|
+
];
|
|
130
|
+
|
|
131
|
+
for (let [input, keyed] of testCases) {
|
|
132
|
+
it(`should key '${input}' as '${keyed}'`, function() {
|
|
133
|
+
assert.equal(pseudoclosure.searchTermToKey(input), keyed);
|
|
134
|
+
});
|
|
135
|
+
}
|
|
136
|
+
});
|
|
137
|
+
|
|
138
|
+
describe('Model-defined applyCasing + (dependent) searchTermToKey', function() {
|
|
139
|
+
// Note: this test only implements enough Turkish-related stuff to facilitate
|
|
140
|
+
// a functional test. Not guaranteed to be sufficient for actual Turkish use.
|
|
141
|
+
let turkishCasing = function(form: CasingForm, text: string, defaultApplyCasing: (form: CasingForm, text: string) => string): string {
|
|
142
|
+
switch(form) {
|
|
143
|
+
case 'lower':
|
|
144
|
+
return defaultApplyCasing(form, text
|
|
145
|
+
.replace(/I/g, 'ı')
|
|
146
|
+
.replace(/İ/g, 'i'));
|
|
147
|
+
case 'upper':
|
|
148
|
+
return defaultApplyCasing(form, text
|
|
149
|
+
.replace(/ı/g, 'I')
|
|
150
|
+
.replace(/i/g, 'İ'));
|
|
151
|
+
case 'initial':
|
|
152
|
+
return turkishCasing('upper', text.charAt(0), defaultApplyCasing) + text.substr(1);
|
|
153
|
+
default:
|
|
154
|
+
return text;
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
let modelSource: LexicalModelSource = {
|
|
159
|
+
languageUsesCasing: true,
|
|
160
|
+
applyCasing: turkishCasing,
|
|
161
|
+
searchTermToKey: function(wordform: string, applyCasing: CasingFunction): string {
|
|
162
|
+
return Array.from(wordform
|
|
163
|
+
.normalize('NFC') // Mostly to avoid decomposing 'İ'
|
|
164
|
+
) // end of `Array.from`
|
|
165
|
+
.map(function(c) { return applyCasing('lower', c)}) // Will use custom `applyCasing` definition!
|
|
166
|
+
.join('');
|
|
167
|
+
},
|
|
168
|
+
sources: [],
|
|
169
|
+
format: 'trie-1.0'
|
|
170
|
+
};
|
|
171
|
+
|
|
172
|
+
let pseudoclosure = new ModelDefinitions(modelSource);
|
|
173
|
+
|
|
174
|
+
const testCases: [string, string, string][] = [
|
|
175
|
+
['İstanbul', 'istanbul', 'istanbul'],
|
|
176
|
+
|
|
177
|
+
// The DEFAULT function is NOT responsible for understanding the Turkish
|
|
178
|
+
// case regarding the lowercasing of:
|
|
179
|
+
// 'I' U+0049 LATIN CAPITAL LETTER I to 'ı' U+0131 LATIN SMALL LETTER DOTLESS I
|
|
180
|
+
// For Turkic languages, the recommendation is to make a
|
|
181
|
+
// custom applyCasing function:
|
|
182
|
+
['DİYARBAKIR', 'diyarbakır', 'diyarbakır'],
|
|
183
|
+
|
|
184
|
+
// Uncased syntax and numbers should pass through unscathed:
|
|
185
|
+
['1234.?!', '1234.?!', '1234.?!']
|
|
186
|
+
];
|
|
187
|
+
|
|
188
|
+
for (let [input, cased, keyed] of testCases) {
|
|
189
|
+
it(`should case '${input}' as '${cased}'`, function() {
|
|
190
|
+
assert.equal(pseudoclosure.applyCasing('lower', input), cased);
|
|
191
|
+
});
|
|
192
|
+
|
|
193
|
+
it(`should key '${input}' as '${keyed}'`, function() {
|
|
194
|
+
assert.equal(pseudoclosure.searchTermToKey(input), keyed);
|
|
195
|
+
});
|
|
196
|
+
}
|
|
197
|
+
});
|
|
198
|
+
});
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { assert } from "chai";
|
|
2
|
+
import defaultWordBreaker from './wordbreakers/default-wordbreaker-esm.js';
|
|
3
|
+
import {decorateWithScriptOverrides} from '../src/script-overrides-decorator.js';
|
|
4
|
+
|
|
5
|
+
const THIN_SPACE = "\u2009";
|
|
6
|
+
|
|
7
|
+
describe('The script overrides word breaker decorator', function () {
|
|
8
|
+
it('should decorate an existing word breaker', function () {
|
|
9
|
+
let breakWords = decorateWithScriptOverrides(defaultWordBreaker, 'break-words-at-spaces')
|
|
10
|
+
assert.isFunction(breakWords);
|
|
11
|
+
});
|
|
12
|
+
|
|
13
|
+
// I do not read this script or language so I have NO idea what this says
|
|
14
|
+
// ¯\_(ツ)_/¯
|
|
15
|
+
const phraseComponents = ["ຈາກ", THIN_SPACE, "ກ໌ນິ", THIN_SPACE, "ສນາ", THIN_SPACE, "ເກ໌າະ", THIN_SPACE, "ຢັອຫ", THIN_SPACE, "ລະ", THIN_SPACE, "ບຣອມ", THIN_SPACE, "ເຢາະ", ","];
|
|
16
|
+
const phraseSpans = phraseComponents.filter(span => span !== THIN_SPACE);
|
|
17
|
+
const phrase = phraseComponents.join("");
|
|
18
|
+
const expectedNumSpans = phraseSpans.length;
|
|
19
|
+
|
|
20
|
+
it(`should break «${[phrase]}» as ${expectedNumSpans} spans`, function () {
|
|
21
|
+
let breakWords = decorateWithScriptOverrides(defaultWordBreaker, 'break-words-at-spaces');
|
|
22
|
+
let defaultResult = defaultWordBreaker(phrase);
|
|
23
|
+
|
|
24
|
+
assert.isAbove(defaultResult.length, expectedNumSpans);
|
|
25
|
+
let actualResult = breakWords(phrase);
|
|
26
|
+
assert.lengthOf(actualResult, expectedNumSpans);
|
|
27
|
+
assert.deepEqual(actualResult.map(grabText), phraseSpans);
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
function grabText(span: Span) {
|
|
31
|
+
return span.text;
|
|
32
|
+
}
|
|
33
|
+
});
|