@shaxpir/duiduidui-models 1.9.26 → 1.9.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/models/Condition.js +8 -0
- package/dist/util/PinyinParser.d.ts +47 -0
- package/dist/util/PinyinParser.js +153 -0
- package/dist/util/PinyinValidator.d.ts +18 -0
- package/dist/util/PinyinValidator.js +162 -0
- package/dist/util/SearchPreprocessor.d.ts +21 -0
- package/dist/util/SearchPreprocessor.js +42 -0
- package/dist/util/SearchTokenizer.d.ts +72 -0
- package/dist/util/SearchTokenizer.js +300 -0
- package/dist/util/index.d.ts +4 -0
- package/dist/util/index.js +4 -0
- package/package.json +1 -1
package/dist/models/Condition.js
CHANGED
|
@@ -160,6 +160,14 @@ exports.Condition = {
|
|
|
160
160
|
.join(', ');
|
|
161
161
|
errors.push(`Conditions [${conflictingTypes}] in 'any' section require a term record, but has_term is false`);
|
|
162
162
|
}
|
|
163
|
+
// Check for contradictory difficulty conditions (min >= max)
|
|
164
|
+
const difficultyCondition = conditions.all?.find(c => c.type === 'difficulty');
|
|
165
|
+
if (difficultyCondition) {
|
|
166
|
+
const { min, max } = difficultyCondition;
|
|
167
|
+
if (min !== undefined && max !== undefined && min >= max) {
|
|
168
|
+
errors.push(`Min difficulty (${min}) must be less than max difficulty (${max})`);
|
|
169
|
+
}
|
|
170
|
+
}
|
|
163
171
|
return errors;
|
|
164
172
|
},
|
|
165
173
|
// Backward compatibility aliases (deprecated - use requiresStarred/allowsStarred/excludesStarred instead)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/**
 * Parser for handling compound pinyin strings with proper syllable boundaries.
 * Declarations for dist/util/PinyinParser.js.
 */
export interface PinyinParseResult {
    // One candidate segmentation of the input into pinyin syllables, in order.
    syllables: string[];
}
export declare const PinyinParser: {
    /**
     * Normalize apostrophes to straight apostrophe for pinyin parsing.
     * Pinyin uses straight apostrophe (') as the standard syllable separator.
     * @param text raw input, possibly containing curly quotes U+2018/U+2019
     * @returns the text with curly single quotes replaced by U+0027
     */
    normalizeApostrophes(text: string): string;
    /**
     * Parse a pinyin string into all possible valid syllable combinations.
     * Returns multiple parsing options for ambiguous cases
     * (e.g. "xian" -> ["xian"] and ["xi", "an"]).
     */
    parseAll(text: string): PinyinParseResult[];
    /**
     * Parse a pinyin string that contains apostrophes; apostrophes are treated
     * as hard syllable boundaries.
     */
    parseWithApostrophes(text: string): PinyinParseResult[];
    /**
     * Parse an ambiguous pinyin string (no apostrophes) into all possible
     * valid combinations.
     */
    parseAmbiguous(text: string): PinyinParseResult[];
    /**
     * Get the best parsing from multiple options (fewest syllables wins).
     * @returns the preferred parsing, or null when `results` is empty
     */
    getBestParsing(results: PinyinParseResult[]): PinyinParseResult | null;
    /**
     * Parse and return only the best parsing.
     * Returns null if the text cannot be parsed as pinyin.
     */
    parse(text: string): string[] | null;
    /**
     * Check if a string could be compound pinyin, i.e. its best parsing has
     * more than one syllable.
     */
    couldBeCompoundPinyin(text: string): boolean;
    /**
     * Handle special cases like 'zhèr', 'nǎr', 'zhèlǐ', etc.
     * (r-colored / erhua endings).
     */
    parseWithSpecialCases(text: string): PinyinParseResult[];
    /**
     * Validate that all syllables in a parsing are legitimate.
     */
    validateParsing(syllables: string[]): boolean;
};
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.PinyinParser = void 0;
|
|
4
|
+
const PinyinValidator_1 = require("./PinyinValidator");
|
|
5
|
+
exports.PinyinParser = {
|
|
6
|
+
/**
|
|
7
|
+
* Normalize apostrophes to straight apostrophe for pinyin parsing.
|
|
8
|
+
* Pinyin uses straight apostrophe (') as the standard syllable separator.
|
|
9
|
+
*/
|
|
10
|
+
normalizeApostrophes(text) {
|
|
11
|
+
// U+2018 = ' (left single quotation mark)
|
|
12
|
+
// U+2019 = ' (right single quotation mark / curly apostrophe)
|
|
13
|
+
return text.replace(/[\u2018\u2019]/g, "'");
|
|
14
|
+
},
|
|
15
|
+
/**
|
|
16
|
+
* Parse a pinyin string into all possible valid syllable combinations
|
|
17
|
+
* Returns multiple parsing options for ambiguous cases
|
|
18
|
+
*/
|
|
19
|
+
parseAll(text) {
|
|
20
|
+
if (!text || text.length === 0)
|
|
21
|
+
return [];
|
|
22
|
+
// Normalize the input (including curly apostrophes to straight)
|
|
23
|
+
const normalized = this.normalizeApostrophes(text.toLowerCase().trim());
|
|
24
|
+
// Handle explicit apostrophes first
|
|
25
|
+
if (normalized.includes("'")) {
|
|
26
|
+
return this.parseWithApostrophes(normalized);
|
|
27
|
+
}
|
|
28
|
+
// For strings without apostrophes, try all possible parsings
|
|
29
|
+
return this.parseAmbiguous(normalized);
|
|
30
|
+
},
|
|
31
|
+
/**
|
|
32
|
+
* Parse pinyin string that contains apostrophes
|
|
33
|
+
*/
|
|
34
|
+
parseWithApostrophes(text) {
|
|
35
|
+
const parts = text.split("'");
|
|
36
|
+
const results = [];
|
|
37
|
+
// The first part doesn't have a preceding apostrophe
|
|
38
|
+
let firstPartParsings = this.parseAmbiguous(parts[0]);
|
|
39
|
+
// For subsequent parts, we know they start with a vowel (that's why there's an apostrophe)
|
|
40
|
+
for (let i = 1; i < parts.length; i++) {
|
|
41
|
+
const part = parts[i];
|
|
42
|
+
const partParsings = this.parseAmbiguous(part);
|
|
43
|
+
// Combine all previous results with all current part results
|
|
44
|
+
const newResults = [];
|
|
45
|
+
if (firstPartParsings.length === 0) {
|
|
46
|
+
firstPartParsings = [{ syllables: [] }];
|
|
47
|
+
}
|
|
48
|
+
for (const prevResult of firstPartParsings) {
|
|
49
|
+
for (const partResult of partParsings) {
|
|
50
|
+
newResults.push({
|
|
51
|
+
syllables: [...prevResult.syllables, ...partResult.syllables]
|
|
52
|
+
});
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
firstPartParsings = newResults;
|
|
56
|
+
}
|
|
57
|
+
return firstPartParsings.filter((result) => result.syllables.length > 0);
|
|
58
|
+
},
|
|
59
|
+
/**
|
|
60
|
+
* Parse ambiguous pinyin string (no apostrophes) into all possible valid combinations
|
|
61
|
+
*/
|
|
62
|
+
parseAmbiguous(text) {
|
|
63
|
+
if (!text)
|
|
64
|
+
return [];
|
|
65
|
+
// Use dynamic programming to find all valid parsings
|
|
66
|
+
const memo = new Map();
|
|
67
|
+
const parseRecursive = (remaining) => {
|
|
68
|
+
if (remaining.length === 0) {
|
|
69
|
+
return [{ syllables: [] }];
|
|
70
|
+
}
|
|
71
|
+
if (memo.has(remaining)) {
|
|
72
|
+
return memo.get(remaining);
|
|
73
|
+
}
|
|
74
|
+
const results = [];
|
|
75
|
+
// Try all possible syllable lengths from longest to shortest
|
|
76
|
+
for (let len = Math.min(6, remaining.length); len >= 1; len--) {
|
|
77
|
+
const candidate = remaining.substring(0, len);
|
|
78
|
+
if (PinyinValidator_1.PinyinValidator.isValidPinyin(candidate)) {
|
|
79
|
+
const restResults = parseRecursive(remaining.substring(len));
|
|
80
|
+
for (const restResult of restResults) {
|
|
81
|
+
results.push({
|
|
82
|
+
syllables: [candidate, ...restResult.syllables]
|
|
83
|
+
});
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
memo.set(remaining, results);
|
|
88
|
+
return results;
|
|
89
|
+
};
|
|
90
|
+
return parseRecursive(text);
|
|
91
|
+
},
|
|
92
|
+
/**
|
|
93
|
+
* Get the best parsing from multiple options
|
|
94
|
+
*/
|
|
95
|
+
getBestParsing(results) {
|
|
96
|
+
if (results.length === 0)
|
|
97
|
+
return null;
|
|
98
|
+
// Prefer fewer syllables (more natural parsing)
|
|
99
|
+
results.sort((a, b) => a.syllables.length - b.syllables.length);
|
|
100
|
+
return results[0];
|
|
101
|
+
},
|
|
102
|
+
/**
|
|
103
|
+
* Parse and return only the best parsing
|
|
104
|
+
* Returns null if the text cannot be parsed as pinyin
|
|
105
|
+
*/
|
|
106
|
+
parse(text) {
|
|
107
|
+
const results = this.parseAll(text);
|
|
108
|
+
const best = this.getBestParsing(results);
|
|
109
|
+
return best ? best.syllables : null;
|
|
110
|
+
},
|
|
111
|
+
/**
|
|
112
|
+
* Check if a string could be compound pinyin
|
|
113
|
+
*/
|
|
114
|
+
couldBeCompoundPinyin(text) {
|
|
115
|
+
const results = this.parseAll(text);
|
|
116
|
+
// Only consider it compound if the BEST parsing has multiple syllables
|
|
117
|
+
const best = this.getBestParsing(results);
|
|
118
|
+
return best !== null && best.syllables.length > 1;
|
|
119
|
+
},
|
|
120
|
+
/**
|
|
121
|
+
* Handle special cases like 'zhèr', 'nǎr', 'zhèlǐ', etc.
|
|
122
|
+
*/
|
|
123
|
+
parseWithSpecialCases(text) {
|
|
124
|
+
const normalized = text.toLowerCase();
|
|
125
|
+
// Handle common r-colored syllables (儿化音)
|
|
126
|
+
const rColoredPatterns = [
|
|
127
|
+
{ pattern: /^(.+)r$/, replacement: (match, base) => {
|
|
128
|
+
// If base + 'r' is valid, keep it; otherwise try to parse base separately
|
|
129
|
+
if (PinyinValidator_1.PinyinValidator.isValidPinyin(match)) {
|
|
130
|
+
return [{ syllables: [match] }];
|
|
131
|
+
}
|
|
132
|
+
const baseResults = this.parseAmbiguous(base);
|
|
133
|
+
return baseResults.map((result) => ({
|
|
134
|
+
syllables: [...result.syllables, 'r']
|
|
135
|
+
}));
|
|
136
|
+
} }
|
|
137
|
+
];
|
|
138
|
+
for (const { pattern, replacement } of rColoredPatterns) {
|
|
139
|
+
const match = normalized.match(pattern);
|
|
140
|
+
if (match) {
|
|
141
|
+
return replacement(match[0], match[1]);
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
// If no special cases match, fall back to regular parsing
|
|
145
|
+
return this.parseAmbiguous(normalized);
|
|
146
|
+
},
|
|
147
|
+
/**
|
|
148
|
+
* Validate that all syllables in a parsing are legitimate
|
|
149
|
+
*/
|
|
150
|
+
validateParsing(syllables) {
|
|
151
|
+
return syllables.every(syllable => PinyinValidator_1.PinyinValidator.isValidPinyin(syllable));
|
|
152
|
+
}
|
|
153
|
+
};
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
 * Utility for validating pinyin tokens.
 * Declarations for dist/util/PinyinValidator.js.
 */
export declare const PinyinValidator: {
    /**
     * Check if a string is a valid pinyin syllable (with or without tone marks).
     * @param text a single candidate syllable, e.g. "zhang" or "nǐ"
     * @returns true when the text forms exactly one well-formed syllable
     */
    isValidPinyin(text: string): boolean;
    /**
     * Check if a string could be a pinyin token (more lenient, for prefix matching
     * while the user is still typing).
     */
    couldBePinyinPrefix(text: string): boolean;
    /**
     * Split a string into potential pinyin tokens (for compound pinyin like "nihao").
     * Returns empty array if not valid pinyin.
     */
    splitPinyinTokens(text: string): string[];
};
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"use strict";
/**
 * Utility for validating pinyin tokens (toneless or tone-marked).
 */
// Valid pinyin initials (including the empty string for zero-initial
// syllables like 'a', 'e').
const INITIALS = new Set([
    '', // empty initial
    'b', 'p', 'm', 'f',
    'd', 't', 'n', 'l',
    'g', 'k', 'h',
    'j', 'q', 'x',
    'zh', 'ch', 'sh', 'r',
    'z', 'c', 's',
    'y', 'w'
]);
// Valid pinyin finals
const FINALS = new Set([
    'a', 'o', 'e', 'i', 'u', 'ü', 'v', // 'v' is often used instead of 'ü'
    'ai', 'ei', 'ui', 'ao', 'ou', 'iu',
    'ie', 'üe', 've', 'ue', 'er',
    'an', 'en', 'in', 'un', 'ün', 'vn',
    'ang', 'eng', 'ing', 'ong',
    'ia', 'iao', 'ian', 'iang', 'iong',
    'ua', 'uo', 'uai', 'uan', 'uang',
    'üan', 'van', 'yuan'
]);
// Common standalone syllables (no separable initial/final split needed)
const STANDALONE_SYLLABLES = new Set([
    'a', 'ai', 'an', 'ang', 'ao',
    'e', 'ei', 'en', 'eng', 'er',
    'o', 'ou',
    'yi', 'ya', 'yao', 'ye', 'you', 'yan', 'yang', 'yin', 'ying', 'yong',
    'wu', 'wa', 'wo', 'wai', 'wei', 'wan', 'wang', 'wen', 'weng',
    'yu', 'yue', 'yuan', 'yun'
]);
// Tone marks that might appear in pinyin (including 'v' with combining tone marks)
const TONE_MARKS = /[āáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜ]|v[\u0301\u030C\u0300]?/g;
// Maps each tone-marked vowel back to its base vowel.
const TONE_TO_BASE = {
    'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a',
    'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e',
    'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i',
    'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o',
    'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u',
    'ǖ': 'ü', 'ǘ': 'ü', 'ǚ': 'ü', 'ǜ': 'ü'
};
/**
 * Lowercase `text` and replace tone-marked vowels with their base vowels.
 * Shared by isValidPinyin and couldBePinyinPrefix (the map was previously
 * duplicated verbatim in both methods).
 */
function stripToneMarks(text) {
    return text.toLowerCase().replace(TONE_MARKS, (match) => {
        // 'v' with a combining tone mark collapses to plain 'v'.
        if (match.startsWith('v')) {
            return 'v';
        }
        return TONE_TO_BASE[match] || match;
    });
}
const PinyinValidator = {
    /**
     * Check if a string is a valid pinyin syllable (with or without tone marks).
     */
    isValidPinyin(text) {
        if (!text || text.length === 0)
            return false;
        const normalized = stripToneMarks(text);
        // Standalone syllables are accepted directly.
        if (STANDALONE_SYLLABLES.has(normalized)) {
            return true;
        }
        // Try initial + final, longest (2-char) initial first so 'zh'/'ch'/'sh'
        // are not mis-split.
        if (normalized.length >= 2) {
            const possibleInitial2 = normalized.substring(0, 2);
            if (INITIALS.has(possibleInitial2) && FINALS.has(normalized.substring(2))) {
                return true;
            }
        }
        // Single-character initial.
        if (normalized.length >= 1) {
            const possibleInitial1 = normalized.substring(0, 1);
            if (INITIALS.has(possibleInitial1) && FINALS.has(normalized.substring(1))) {
                return true;
            }
        }
        // Zero-initial syllable: the whole string is a bare final.
        if (FINALS.has(normalized)) {
            return true;
        }
        return false;
    },
    /**
     * Check if a string could be a pinyin token (more lenient, for prefix matching).
     */
    couldBePinyinPrefix(text) {
        if (!text || text.length === 0)
            return false;
        const normalized = stripToneMarks(text);
        // Already a complete valid syllable.
        if (this.isValidPinyin(text))
            return true;
        // Any standalone syllable starting with this prefix.
        for (const syllable of STANDALONE_SYLLABLES) {
            if (syllable.startsWith(normalized))
                return true;
        }
        // Could it start an initial + final combination?
        // BUG FIX: the empty-string initial must be skipped here —
        // `normalized.startsWith('')` is always true, which previously made
        // this function return true for ANY non-empty input (e.g. '999', '!!!').
        for (const initial of INITIALS) {
            if (initial.length === 0)
                continue;
            if (initial.startsWith(normalized) || normalized.startsWith(initial)) {
                return true;
            }
        }
        return false;
    },
    /**
     * Split a string into potential pinyin tokens (for compound pinyin like "nihao").
     * Returns empty array if not valid pinyin.
     * This is a simplified greedy longest-match split; ambiguous cases are
     * handled more thoroughly by PinyinParser.
     */
    splitPinyinTokens(text) {
        const tokens = [];
        let remaining = text.toLowerCase();
        while (remaining.length > 0) {
            let found = false;
            // Try to match the longest possible valid pinyin syllable (max 6 chars).
            for (let len = Math.min(6, remaining.length); len >= 1; len--) {
                const candidate = remaining.substring(0, len);
                if (this.isValidPinyin(candidate)) {
                    tokens.push(candidate);
                    remaining = remaining.substring(len);
                    found = true;
                    break;
                }
            }
            // If no valid syllable found, this isn't valid pinyin.
            if (!found) {
                return [];
            }
        }
        return tokens;
    }
};
// CommonJS export; guarded so the module body is also loadable in
// environments where `exports` is not defined. CJS consumers see the
// exact same exports as before.
if (typeof exports !== "undefined") {
    Object.defineProperty(exports, "__esModule", { value: true });
    exports.PinyinValidator = PinyinValidator;
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
 * Utility functions for preprocessing search queries.
 * Declarations for dist/util/SearchPreprocessor.js.
 */
/**
 * Cleans and normalizes a search string for database queries
 * - Removes trailing punctuation
 * - Trims whitespace
 * - Preserves internal punctuation (e.g., apostrophes in "don't")
 *
 * @param searchString The raw search string from user input (null/undefined yields '')
 * @returns The cleaned search string
 */
export declare function preprocessSearchString(searchString: string | null | undefined): string;
/**
 * Tests if a string contains Chinese characters
 * (CJK Unified Ideographs, U+4E00–U+9FFF).
 */
export declare function containsChineseCharacters(text: string): boolean;
/**
 * Tests if a string contains only ASCII punctuation
 * (the set `! ? . , : ;`; an empty string returns false).
 */
export declare function isOnlyPunctuation(text: string): boolean;
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Utility functions for preprocessing search queries
|
|
4
|
+
*/
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.preprocessSearchString = preprocessSearchString;
|
|
7
|
+
exports.containsChineseCharacters = containsChineseCharacters;
|
|
8
|
+
exports.isOnlyPunctuation = isOnlyPunctuation;
|
|
9
|
+
/**
|
|
10
|
+
* Cleans and normalizes a search string for database queries
|
|
11
|
+
* - Removes trailing punctuation
|
|
12
|
+
* - Trims whitespace
|
|
13
|
+
* - Preserves internal punctuation (e.g., apostrophes in "don't")
|
|
14
|
+
*
|
|
15
|
+
* @param searchString The raw search string from user input
|
|
16
|
+
* @returns The cleaned search string
|
|
17
|
+
*/
|
|
18
|
+
function preprocessSearchString(searchString) {
    // Null/undefined/empty input normalizes to the empty string.
    if (!searchString)
        return '';
    // Trim whitespace, then strip any run of leading and trailing punctuation
    // that users might accidentally include. Internal punctuation (e.g. the
    // apostrophe in "don't") is deliberately preserved.
    return searchString
        .trim()
        .replace(/^[!?.,:;]+/, '')
        .replace(/[!?.,:;]+$/, '');
}
|
|
31
|
+
/**
|
|
32
|
+
* Tests if a string contains Chinese characters
|
|
33
|
+
*/
|
|
34
|
+
function containsChineseCharacters(text) {
    // CJK Unified Ideographs block (U+4E00–U+9FFF).
    const hanziPattern = /[\u4e00-\u9fff]/;
    return hanziPattern.test(text);
}
|
|
37
|
+
/**
|
|
38
|
+
* Tests if a string contains only ASCII punctuation
|
|
39
|
+
*/
|
|
40
|
+
function isOnlyPunctuation(text) {
    // One or more characters, all drawn from the ASCII punctuation set
    // used elsewhere in this module; empty string does not match.
    const onlyPunctuation = /^[!?.,:;]+$/;
    return onlyPunctuation.test(text);
}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
 * Token types for search query tokenization
 */
export declare enum TokenType {
    HANZI = "hanzi",
    PINYIN = "pinyin",
    ENGLISH = "english",
    AMBIGUOUS = "ambiguous",// Could be either pinyin or English
    PUNCTUATION = "punctuation",
    NUMBER = "number"
}
/**
 * Represents a single token from the search query
 */
export interface SearchToken {
    // The token text exactly as it appeared in the query.
    text: string;
    type: TokenType;
    // Lowercased (and, for English, apostrophe-normalized) form for matching.
    normalized: string;
    // Set when the token parses as pinyin (PINYIN and pinyin-capable AMBIGUOUS tokens).
    isPossiblePinyin?: boolean;
    // Syllable split produced by PinyinParser, when one exists.
    pinyinVariants?: string[];
}
/**
 * Result of tokenizing a search string
 */
export interface TokenizedSearch {
    tokens: SearchToken[];
    hanziTokens: SearchToken[];
    pinyinTokens: SearchToken[];
    englishTokens: SearchToken[];
    ambiguousTokens: SearchToken[];
    hasHanzi: boolean;
    hasPinyin: boolean;
    hasEnglish: boolean;
}
/**
 * Tokenizes and classifies search strings for multi-modal search
 */
export declare class SearchTokenizer {
    // Punctuation that always ends a token.
    private static readonly BOUNDARY_PUNCTUATION;
    // Punctuation that may appear inside a word (apostrophes, hyphen).
    private static readonly EMBEDDED_PUNCTUATION;
    // Straight and curly apostrophe characters.
    private static readonly APOSTROPHES;
    // Full-width / Chinese punctuation marks.
    private static readonly CHINESE_PUNCTUATION;
    // Union of all punctuation classes, used for detection.
    private static readonly ALL_PUNCTUATION;
    // CJK Unified Ideographs range.
    private static readonly HANZI_REGEX;
    // Tone-marked pinyin vowels.
    private static readonly PINYIN_TONE_MARKS;
    /**
     * Main tokenization method
     */
    static tokenize(searchString: string): TokenizedSearch;
    /**
     * Splits input string into tokens, handling punctuation intelligently
     */
    private static splitIntoTokens;
    /**
     * Normalize apostrophes to curly for English database matching.
     * The database stores English contractions with curly apostrophes (U+2019).
     */
    private static normalizeApostrophesToCurly;
    /**
     * Check if a token with embedded apostrophes could be pinyin with syllable separators.
     * Returns the parsed syllables if valid, null otherwise.
     */
    private static tryParsePinyinWithApostrophe;
    /**
     * Classifies a single token
     */
    private static classifyToken;
    /**
     * Checks if a string contains only hanzi characters
     */
    private static isAllHanzi;
}
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.SearchTokenizer = exports.TokenType = void 0;
const PinyinParser_1 = require("./PinyinParser");
/**
 * Token types for search query tokenization
 */
var TokenType;
(function (TokenType) {
    TokenType["HANZI"] = "hanzi";
    TokenType["PINYIN"] = "pinyin";
    TokenType["ENGLISH"] = "english";
    TokenType["AMBIGUOUS"] = "ambiguous";
    TokenType["PUNCTUATION"] = "punctuation";
    TokenType["NUMBER"] = "number";
})(TokenType || (exports.TokenType = TokenType = {}));
/**
 * Tokenizes and classifies search strings for multi-modal search
 * (hanzi, pinyin, and English terms mixed in one query).
 */
class SearchTokenizer {
    /**
     * Main tokenization method: splits the query into raw tokens, classifies
     * each, and returns the tokens grouped by type plus summary flags.
     */
    static tokenize(searchString) {
        // Blank query: empty result with all flags false.
        if (!searchString || !searchString.trim()) {
            return {
                tokens: [],
                hanziTokens: [],
                pinyinTokens: [],
                englishTokens: [],
                ambiguousTokens: [],
                hasHanzi: false,
                hasPinyin: false,
                hasEnglish: false
            };
        }
        // First, split the string into raw tokens
        const rawTokens = this.splitIntoTokens(searchString);
        // Classify each token
        const classifiedTokens = rawTokens.map(token => this.classifyToken(token));
        // Group tokens by type
        const hanziTokens = classifiedTokens.filter(t => t.type === TokenType.HANZI);
        const pinyinTokens = classifiedTokens.filter(t => t.type === TokenType.PINYIN);
        const englishTokens = classifiedTokens.filter(t => t.type === TokenType.ENGLISH);
        const ambiguousTokens = classifiedTokens.filter(t => t.type === TokenType.AMBIGUOUS);
        return {
            tokens: classifiedTokens,
            hanziTokens,
            pinyinTokens,
            englishTokens,
            ambiguousTokens,
            hasHanzi: hanziTokens.length > 0,
            // Ambiguous tokens count toward BOTH pinyin and English candidates.
            hasPinyin: pinyinTokens.length > 0 || ambiguousTokens.some(t => t.isPossiblePinyin),
            hasEnglish: englishTokens.length > 0 || ambiguousTokens.length > 0
        };
    }
    /**
     * Splits input string into tokens, handling punctuation intelligently.
     * Scans character by character, tracking the previous/next character so
     * embedded punctuation (apostrophes, hyphens) can be distinguished from
     * boundary punctuation.
     */
    static splitIntoTokens(input) {
        const tokens = [];
        let current = '';
        for (let i = 0; i < input.length; i++) {
            const char = input[i];
            const nextChar = i < input.length - 1 ? input[i + 1] : '';
            const prevChar = i > 0 ? input[i - 1] : '';
            // Handle whitespace - always splits tokens
            if (/\s/.test(char)) {
                if (current) {
                    tokens.push(current);
                    current = '';
                }
                continue;
            }
            // Handle Chinese characters - consecutive hanzi accumulate into a
            // single token; any pending non-hanzi token is flushed first.
            if (this.HANZI_REGEX.test(char)) {
                if (current && !this.isAllHanzi(current)) {
                    tokens.push(current);
                    current = '';
                }
                current += char;
                // Check if next char is also hanzi, if not, push current
                if (!this.HANZI_REGEX.test(nextChar)) {
                    tokens.push(current);
                    current = '';
                }
                continue;
            }
            // Handle numbers - they should be separate tokens
            if (/\d/.test(char)) {
                // If we have accumulated non-digit text, push it
                if (current && !/^\d+$/.test(current)) {
                    tokens.push(current);
                    current = '';
                }
                current += char;
                // If next char is not a digit, push the number
                if (!/\d/.test(nextChar)) {
                    tokens.push(current);
                    current = '';
                }
                continue;
            }
            // Handle punctuation
            if (this.ALL_PUNCTUATION.test(char)) {
                // Check if it's embedded punctuation (like apostrophe in "don't" or "xi'an")
                if (this.EMBEDDED_PUNCTUATION.test(char)) {
                    // Only keep embedded if surrounded by letters (including accented pinyin vowels)
                    // This regex matches ASCII letters plus common pinyin tone-marked vowels
                    const letterPattern = /[a-zA-Z\u0101\u00E1\u01CE\u00E0\u0113\u00E9\u011B\u00E8\u012B\u00ED\u01D0\u00EC\u014D\u00F3\u01D2\u00F2\u016B\u00FA\u01D4\u00F9\u01D6\u01D8\u01DA\u01DC]/;
                    const isEmbedded = prevChar && nextChar &&
                        letterPattern.test(prevChar) &&
                        letterPattern.test(nextChar);
                    if (isEmbedded) {
                        current += char;
                    }
                    else {
                        // Treat as boundary: flush pending text, emit the mark itself.
                        if (current) {
                            tokens.push(current);
                            current = '';
                        }
                        tokens.push(char);
                    }
                }
                else {
                    // Boundary punctuation - always splits
                    if (current) {
                        tokens.push(current);
                        current = '';
                    }
                    tokens.push(char);
                }
                continue;
            }
            // Regular character - add to current token
            // But split if transitioning from digits
            if (current && /^\d+$/.test(current)) {
                tokens.push(current);
                current = '';
            }
            current += char;
        }
        // Don't forget the last token
        if (current) {
            tokens.push(current);
        }
        return tokens.filter(t => t.length > 0);
    }
    /**
     * Normalize apostrophes to curly for English database matching.
     * The database stores English contractions with curly apostrophes (U+2019).
     */
    static normalizeApostrophesToCurly(text) {
        // U+0027 = ' (straight apostrophe)
        // U+2018 = ' (left single quotation mark)
        // Replace both with U+2019 (right single quotation mark / curly apostrophe)
        return text.replace(/[\u0027\u2018]/g, '\u2019');
    }
    /**
     * Check if a token with embedded apostrophes could be pinyin with syllable separators.
     * Returns the parsed syllables if valid, null otherwise.
     */
    static tryParsePinyinWithApostrophe(token) {
        // Only consider tokens that contain apostrophes
        if (!this.APOSTROPHES.test(token)) {
            return null;
        }
        // Use PinyinParser which handles both straight and curly apostrophes
        const pinyinParsing = PinyinParser_1.PinyinParser.parse(token);
        // Only consider it pinyin if the parsing produced multiple syllables
        // (the apostrophe actually acted as a separator)
        if (pinyinParsing !== null && pinyinParsing.length > 1) {
            return pinyinParsing;
        }
        return null;
    }
    /**
     * Classifies a single token. Branch order matters: punctuation, number,
     * hanzi, apostrophe-bearing tokens, tone-marked pinyin, toneless pinyin,
     * then English as the fallback.
     */
    static classifyToken(token) {
        const normalized = token.toLowerCase();
        // Check for single-character punctuation
        if (token.length === 1 && this.ALL_PUNCTUATION.test(token)) {
            return {
                text: token,
                type: TokenType.PUNCTUATION,
                normalized
            };
        }
        // Check for numbers
        if (/^\d+$/.test(token)) {
            return {
                text: token,
                type: TokenType.NUMBER,
                normalized
            };
        }
        // Check for Chinese characters
        if (this.HANZI_REGEX.test(token)) {
            return {
                text: token,
                type: TokenType.HANZI,
                normalized: token // Hanzi doesn't need lowercasing
            };
        }
        // Check for tokens with embedded apostrophes - could be pinyin syllable separators
        if (this.APOSTROPHES.test(token)) {
            const pinyinWithApostrophe = this.tryParsePinyinWithApostrophe(token);
            if (pinyinWithApostrophe !== null) {
                // Token contains apostrophe that acts as pinyin syllable separator
                // Check if it has tone marks to determine if it's definitely pinyin or ambiguous
                if (this.PINYIN_TONE_MARKS.test(token)) {
                    return {
                        text: token,
                        type: TokenType.PINYIN,
                        normalized,
                        isPossiblePinyin: true,
                        pinyinVariants: pinyinWithApostrophe
                    };
                }
                else {
                    // Could be pinyin (xi'an) or could be English with apostrophe
                    // Treat as ambiguous
                    return {
                        text: token,
                        type: TokenType.AMBIGUOUS,
                        normalized,
                        isPossiblePinyin: true,
                        pinyinVariants: pinyinWithApostrophe
                    };
                }
            }
            // Not valid pinyin with apostrophe - treat as English
            // Normalize apostrophes to curly for database matching
            return {
                text: token,
                type: TokenType.ENGLISH,
                normalized: this.normalizeApostrophesToCurly(normalized)
            };
        }
        // Check for pinyin with tone marks
        if (this.PINYIN_TONE_MARKS.test(token)) {
            const pinyinParsing = PinyinParser_1.PinyinParser.parse(token);
            if (pinyinParsing !== null) {
                return {
                    text: token,
                    type: TokenType.PINYIN,
                    normalized,
                    isPossiblePinyin: true,
                    pinyinVariants: pinyinParsing
                };
            }
        }
        // Try to parse as pinyin (without tone marks)
        const pinyinParsing = PinyinParser_1.PinyinParser.parse(token);
        if (pinyinParsing !== null) {
            // Ambiguous - could be pinyin or English
            return {
                text: token,
                type: TokenType.AMBIGUOUS,
                normalized,
                isPossiblePinyin: true,
                pinyinVariants: pinyinParsing
            };
        }
        // Default to English
        return {
            text: token,
            type: TokenType.ENGLISH,
            normalized
        };
    }
    /**
     * Checks if a string contains only hanzi characters.
     * BMP-only range, so split('') per char is safe here.
     */
    static isAllHanzi(str) {
        return str.split('').every(char => this.HANZI_REGEX.test(char));
    }
}
exports.SearchTokenizer = SearchTokenizer;
// Common English punctuation that should be treated as word boundaries
// NOTE(review): not referenced within this file — possibly kept for
// consumers or future use; confirm before removing.
SearchTokenizer.BOUNDARY_PUNCTUATION = /[.!?;:,]/;
// Punctuation that can be embedded in words (apostrophes, hyphens)
// U+0027 = ' (straight apostrophe)
// U+2018 = ' (left single quotation mark)
// U+2019 = ' (right single quotation mark / curly apostrophe)
SearchTokenizer.EMBEDDED_PUNCTUATION = /[\u0027\u2018\u2019\-]/;
// Apostrophe characters (both straight and curly)
SearchTokenizer.APOSTROPHES = /[\u0027\u2018\u2019]/;
// Chinese punctuation marks
// U+2018/U+2019 = curly single quotes
// U+201C/U+201D = curly double quotes
// NOTE(review): not referenced within this file — confirm before removing.
SearchTokenizer.CHINESE_PUNCTUATION = /[。!?;:,、\u201C\u201D\u2018\u2019()《》【】]/;
// All punctuation for detection
SearchTokenizer.ALL_PUNCTUATION = /[.!?;:,\u0027\u2018\u2019\-()。!?;:,、\u201C\u201D()《》【】]/;
// Chinese character ranges (CJK Unified Ideographs)
SearchTokenizer.HANZI_REGEX = /[\u4e00-\u9fff]/;
// Pinyin tone marks
SearchTokenizer.PINYIN_TONE_MARKS = /[āáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜ]/i;
|
package/dist/util/index.d.ts
CHANGED
|
@@ -4,4 +4,8 @@ export * from './Database';
|
|
|
4
4
|
export * from './DifficultyRange';
|
|
5
5
|
export * from './Encryption';
|
|
6
6
|
export * from './Logging';
|
|
7
|
+
export * from './PinyinParser';
|
|
8
|
+
export * from './PinyinValidator';
|
|
9
|
+
export * from './SearchPreprocessor';
|
|
10
|
+
export * from './SearchTokenizer';
|
|
7
11
|
export * from './SenseRankEncoder';
|
package/dist/util/index.js
CHANGED
|
@@ -21,4 +21,8 @@ __exportStar(require("./Database"), exports);
|
|
|
21
21
|
__exportStar(require("./DifficultyRange"), exports);
|
|
22
22
|
__exportStar(require("./Encryption"), exports);
|
|
23
23
|
__exportStar(require("./Logging"), exports);
|
|
24
|
+
__exportStar(require("./PinyinParser"), exports);
|
|
25
|
+
__exportStar(require("./PinyinValidator"), exports);
|
|
26
|
+
__exportStar(require("./SearchPreprocessor"), exports);
|
|
27
|
+
__exportStar(require("./SearchTokenizer"), exports);
|
|
24
28
|
__exportStar(require("./SenseRankEncoder"), exports);
|