@shaxpir/duiduidui-models 1.9.25 → 1.9.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,60 @@
1
import { Conditions } from '../models/Condition';
/**
 * Represents a difficulty range with guaranteed min (defaults to 0).
 * Used for intersecting pool difficulty ranges with Collection constraints.
 */
export interface DifficultyRange {
    /** Inclusive lower bound; always present (0 when the source condition specifies none). */
    min: number;
    /** Upper bound (exclusive per isWithinDifficultyRange); undefined means unbounded above. */
    max: number | undefined;
}
/**
 * Extracts difficulty constraints from Conditions.
 *
 * Checks the 'all' section for difficulty conditions since that's where
 * Collection-level constraints are placed (they must all be satisfied).
 *
 * Exploits the invariant that difficulty and skill level are always >= 0,
 * so min defaults to 0 when not specified.
 *
 * @param conditions - The conditions to search
 * @returns DifficultyRange if a difficulty condition is found, null otherwise
 */
export declare function extractDifficultyConstraints(conditions?: Conditions): DifficultyRange | null;
/**
 * Result of intersecting difficulty ranges.
 */
export interface DifficultyRangeIntersection {
    /** The effective minimum difficulty to use */
    min: number;
    /** The effective maximum difficulty to use */
    max: number;
    /** True if a valid intersection was found, false if fell back to Collection range */
    usedIntersection: boolean;
    /** True if Collection had a difficulty constraint */
    hadCollectionConstraint: boolean;
}
/**
 * Calculates the intersection of a pool's difficulty range with Collection constraints.
 *
 * If no intersection exists (ranges are disjoint), returns the Collection's range
 * since the Collection constraints take precedence over skill-based calculations.
 *
 * @param poolMin - The pool's calculated minimum difficulty
 * @param poolMax - The pool's calculated maximum difficulty
 * @param collectionRange - The Collection's difficulty constraints (if any)
 * @returns The effective range to use for queries, plus metadata
 */
export declare function intersectDifficultyRanges(poolMin: number, poolMax: number, collectionRange: DifficultyRange | null): DifficultyRangeIntersection;
/**
 * Checks if a difficulty value falls within a range.
 *
 * Useful for in-memory filtering in strategies that don't use database queries
 * (ReinforcementPoolStrategy, StalePoolStrategy, DecompositionPoolStrategy).
 *
 * Uses half-open interval semantics: min is inclusive (>=), max is exclusive (<).
 *
 * @param difficulty - The difficulty value to check
 * @param range - The range to check against (null means no constraint)
 * @returns true if difficulty is within range or no range specified
 */
export declare function isWithinDifficultyRange(difficulty: number, range: DifficultyRange | null): boolean;
@@ -0,0 +1,95 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.extractDifficultyConstraints = extractDifficultyConstraints;
4
+ exports.intersectDifficultyRanges = intersectDifficultyRanges;
5
+ exports.isWithinDifficultyRange = isWithinDifficultyRange;
6
/**
 * Extracts difficulty constraints from Conditions.
 *
 * Checks the 'all' section for difficulty conditions since that's where
 * Collection-level constraints are placed (they must all be satisfied).
 *
 * Exploits the invariant that difficulty and skill level are always >= 0,
 * so min defaults to 0 when not specified.
 *
 * @param conditions - The conditions to search
 * @returns DifficultyRange if a difficulty condition is found, null otherwise
 */
function extractDifficultyConstraints(conditions) {
    const allConditions = conditions?.all;
    if (!allConditions) {
        return null;
    }
    // Only the first difficulty condition in 'all' is considered.
    const difficultyCondition = allConditions.find((c) => c.type === 'difficulty');
    if (!difficultyCondition) {
        return null;
    }
    return {
        min: difficultyCondition.min ?? 0,
        max: difficultyCondition.max
    };
}
32
/**
 * Calculates the intersection of a pool's difficulty range with Collection constraints.
 *
 * If no intersection exists (ranges are disjoint), returns the Collection's range
 * since the Collection constraints take precedence over skill-based calculations.
 *
 * @param poolMin - The pool's calculated minimum difficulty
 * @param poolMax - The pool's calculated maximum difficulty
 * @param collectionRange - The Collection's difficulty constraints (if any)
 * @returns The effective range to use for queries, plus metadata
 */
function intersectDifficultyRanges(poolMin, poolMax, collectionRange) {
    // Without a Collection constraint the pool's own range is authoritative.
    if (!collectionRange) {
        return {
            min: poolMin,
            max: poolMax,
            usedIntersection: true,
            hadCollectionConstraint: false
        };
    }
    // Intersect the two ranges; an absent Collection max imposes no upper bound.
    const lower = Math.max(poolMin, collectionRange.min);
    const upper = collectionRange.max !== undefined
        ? Math.min(poolMax, collectionRange.max)
        : poolMax;
    // A half-open range is non-empty only when lower < upper.
    const overlaps = lower < upper;
    if (overlaps) {
        return {
            min: lower,
            max: upper,
            usedIntersection: true,
            hadCollectionConstraint: true
        };
    }
    // Disjoint: the Collection range wins (pool max stands in for a missing upper bound).
    return {
        min: collectionRange.min,
        max: collectionRange.max ?? poolMax,
        usedIntersection: false,
        hadCollectionConstraint: true
    };
}
75
/**
 * Checks if a difficulty value falls within a range.
 *
 * Useful for in-memory filtering in strategies that don't use database queries
 * (ReinforcementPoolStrategy, StalePoolStrategy, DecompositionPoolStrategy).
 *
 * Uses half-open interval semantics: min is inclusive (>=), max is exclusive (<).
 *
 * @param difficulty - The difficulty value to check
 * @param range - The range to check against (null means no constraint)
 * @returns true if difficulty is within range or no range specified
 */
function isWithinDifficultyRange(difficulty, range) {
    // A missing range means no constraint at all.
    if (!range) {
        return true;
    }
    const meetsLowerBound = difficulty >= range.min; // min is inclusive
    const meetsUpperBound = range.max === undefined || difficulty < range.max; // max is exclusive
    return meetsLowerBound && meetsUpperBound;
}
@@ -0,0 +1,47 @@
1
/**
 * Parser for handling compound pinyin strings with proper syllable boundaries
 */
export interface PinyinParseResult {
    /** One candidate segmentation of the input into pinyin syllables. */
    syllables: string[];
}
export declare const PinyinParser: {
    /**
     * Normalize apostrophes to straight apostrophe for pinyin parsing.
     * Pinyin uses straight apostrophe (') as the standard syllable separator.
     */
    normalizeApostrophes(text: string): string;
    /**
     * Parse a pinyin string into all possible valid syllable combinations
     * Returns multiple parsing options for ambiguous cases
     */
    parseAll(text: string): PinyinParseResult[];
    /**
     * Parse pinyin string that contains apostrophes
     * (the apostrophes are treated as explicit syllable separators)
     */
    parseWithApostrophes(text: string): PinyinParseResult[];
    /**
     * Parse ambiguous pinyin string (no apostrophes) into all possible valid combinations
     */
    parseAmbiguous(text: string): PinyinParseResult[];
    /**
     * Get the best parsing from multiple options
     * (prefers the segmentation with the fewest syllables)
     */
    getBestParsing(results: PinyinParseResult[]): PinyinParseResult | null;
    /**
     * Parse and return only the best parsing
     * Returns null if the text cannot be parsed as pinyin
     */
    parse(text: string): string[] | null;
    /**
     * Check if a string could be compound pinyin
     * (i.e. its best parsing has more than one syllable)
     */
    couldBeCompoundPinyin(text: string): boolean;
    /**
     * Handle special cases like 'zhèr', 'nǎr', 'zhèlǐ', etc.
     * (r-colored / erhua endings)
     */
    parseWithSpecialCases(text: string): PinyinParseResult[];
    /**
     * Validate that all syllables in a parsing are legitimate
     */
    validateParsing(syllables: string[]): boolean;
};
@@ -0,0 +1,153 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.PinyinParser = void 0;
4
+ const PinyinValidator_1 = require("./PinyinValidator");
5
+ exports.PinyinParser = {
6
+ /**
7
+ * Normalize apostrophes to straight apostrophe for pinyin parsing.
8
+ * Pinyin uses straight apostrophe (') as the standard syllable separator.
9
+ */
10
+ normalizeApostrophes(text) {
11
+ // U+2018 = ' (left single quotation mark)
12
+ // U+2019 = ' (right single quotation mark / curly apostrophe)
13
+ return text.replace(/[\u2018\u2019]/g, "'");
14
+ },
15
+ /**
16
+ * Parse a pinyin string into all possible valid syllable combinations
17
+ * Returns multiple parsing options for ambiguous cases
18
+ */
19
+ parseAll(text) {
20
+ if (!text || text.length === 0)
21
+ return [];
22
+ // Normalize the input (including curly apostrophes to straight)
23
+ const normalized = this.normalizeApostrophes(text.toLowerCase().trim());
24
+ // Handle explicit apostrophes first
25
+ if (normalized.includes("'")) {
26
+ return this.parseWithApostrophes(normalized);
27
+ }
28
+ // For strings without apostrophes, try all possible parsings
29
+ return this.parseAmbiguous(normalized);
30
+ },
31
+ /**
32
+ * Parse pinyin string that contains apostrophes
33
+ */
34
+ parseWithApostrophes(text) {
35
+ const parts = text.split("'");
36
+ const results = [];
37
+ // The first part doesn't have a preceding apostrophe
38
+ let firstPartParsings = this.parseAmbiguous(parts[0]);
39
+ // For subsequent parts, we know they start with a vowel (that's why there's an apostrophe)
40
+ for (let i = 1; i < parts.length; i++) {
41
+ const part = parts[i];
42
+ const partParsings = this.parseAmbiguous(part);
43
+ // Combine all previous results with all current part results
44
+ const newResults = [];
45
+ if (firstPartParsings.length === 0) {
46
+ firstPartParsings = [{ syllables: [] }];
47
+ }
48
+ for (const prevResult of firstPartParsings) {
49
+ for (const partResult of partParsings) {
50
+ newResults.push({
51
+ syllables: [...prevResult.syllables, ...partResult.syllables]
52
+ });
53
+ }
54
+ }
55
+ firstPartParsings = newResults;
56
+ }
57
+ return firstPartParsings.filter((result) => result.syllables.length > 0);
58
+ },
59
+ /**
60
+ * Parse ambiguous pinyin string (no apostrophes) into all possible valid combinations
61
+ */
62
+ parseAmbiguous(text) {
63
+ if (!text)
64
+ return [];
65
+ // Use dynamic programming to find all valid parsings
66
+ const memo = new Map();
67
+ const parseRecursive = (remaining) => {
68
+ if (remaining.length === 0) {
69
+ return [{ syllables: [] }];
70
+ }
71
+ if (memo.has(remaining)) {
72
+ return memo.get(remaining);
73
+ }
74
+ const results = [];
75
+ // Try all possible syllable lengths from longest to shortest
76
+ for (let len = Math.min(6, remaining.length); len >= 1; len--) {
77
+ const candidate = remaining.substring(0, len);
78
+ if (PinyinValidator_1.PinyinValidator.isValidPinyin(candidate)) {
79
+ const restResults = parseRecursive(remaining.substring(len));
80
+ for (const restResult of restResults) {
81
+ results.push({
82
+ syllables: [candidate, ...restResult.syllables]
83
+ });
84
+ }
85
+ }
86
+ }
87
+ memo.set(remaining, results);
88
+ return results;
89
+ };
90
+ return parseRecursive(text);
91
+ },
92
+ /**
93
+ * Get the best parsing from multiple options
94
+ */
95
+ getBestParsing(results) {
96
+ if (results.length === 0)
97
+ return null;
98
+ // Prefer fewer syllables (more natural parsing)
99
+ results.sort((a, b) => a.syllables.length - b.syllables.length);
100
+ return results[0];
101
+ },
102
+ /**
103
+ * Parse and return only the best parsing
104
+ * Returns null if the text cannot be parsed as pinyin
105
+ */
106
+ parse(text) {
107
+ const results = this.parseAll(text);
108
+ const best = this.getBestParsing(results);
109
+ return best ? best.syllables : null;
110
+ },
111
+ /**
112
+ * Check if a string could be compound pinyin
113
+ */
114
+ couldBeCompoundPinyin(text) {
115
+ const results = this.parseAll(text);
116
+ // Only consider it compound if the BEST parsing has multiple syllables
117
+ const best = this.getBestParsing(results);
118
+ return best !== null && best.syllables.length > 1;
119
+ },
120
+ /**
121
+ * Handle special cases like 'zhèr', 'nǎr', 'zhèlǐ', etc.
122
+ */
123
+ parseWithSpecialCases(text) {
124
+ const normalized = text.toLowerCase();
125
+ // Handle common r-colored syllables (儿化音)
126
+ const rColoredPatterns = [
127
+ { pattern: /^(.+)r$/, replacement: (match, base) => {
128
+ // If base + 'r' is valid, keep it; otherwise try to parse base separately
129
+ if (PinyinValidator_1.PinyinValidator.isValidPinyin(match)) {
130
+ return [{ syllables: [match] }];
131
+ }
132
+ const baseResults = this.parseAmbiguous(base);
133
+ return baseResults.map((result) => ({
134
+ syllables: [...result.syllables, 'r']
135
+ }));
136
+ } }
137
+ ];
138
+ for (const { pattern, replacement } of rColoredPatterns) {
139
+ const match = normalized.match(pattern);
140
+ if (match) {
141
+ return replacement(match[0], match[1]);
142
+ }
143
+ }
144
+ // If no special cases match, fall back to regular parsing
145
+ return this.parseAmbiguous(normalized);
146
+ },
147
+ /**
148
+ * Validate that all syllables in a parsing are legitimate
149
+ */
150
+ validateParsing(syllables) {
151
+ return syllables.every(syllable => PinyinValidator_1.PinyinValidator.isValidPinyin(syllable));
152
+ }
153
+ };
@@ -0,0 +1,18 @@
1
/**
 * Utility for validating pinyin tokens
 */
export declare const PinyinValidator: {
    /**
     * Check if a string is a valid pinyin syllable (with or without tone marks)
     */
    isValidPinyin(text: string): boolean;
    /**
     * Check if a string could be a pinyin token (more lenient, for prefix matching)
     */
    couldBePinyinPrefix(text: string): boolean;
    /**
     * Split a string into potential pinyin tokens (for compound pinyin like "nihao")
     * Returns empty array if not valid pinyin
     * (greedy longest-match; does not backtrack over ambiguous splits)
     */
    splitPinyinTokens(text: string): string[];
};
@@ -0,0 +1,162 @@
1
+ "use strict";
2
+ /**
3
+ * Utility for validating pinyin tokens
4
+ */
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.PinyinValidator = void 0;
7
// Valid pinyin initials (including empty string for syllables like 'a', 'e')
const INITIALS = new Set([
    '', // empty initial
    'b', 'p', 'm', 'f',
    'd', 't', 'n', 'l',
    'g', 'k', 'h',
    'j', 'q', 'x',
    'zh', 'ch', 'sh', 'r',
    'z', 'c', 's',
    'y', 'w'
]);
// Valid pinyin finals
const FINALS = new Set([
    'a', 'o', 'e', 'i', 'u', 'ü', 'v', // 'v' is often used instead of 'ü'
    'ai', 'ei', 'ui', 'ao', 'ou', 'iu',
    'ie', 'üe', 've', 'ue', 'er',
    'an', 'en', 'in', 'un', 'ün', 'vn',
    'ang', 'eng', 'ing', 'ong',
    'ia', 'iao', 'ian', 'iang', 'iong',
    'ua', 'uo', 'uai', 'uan', 'uang',
    'üan', 'van', 'yuan'
]);
// Common standalone syllables
const STANDALONE_SYLLABLES = new Set([
    'a', 'ai', 'an', 'ang', 'ao',
    'e', 'ei', 'en', 'eng', 'er',
    'o', 'ou',
    'yi', 'ya', 'yao', 'ye', 'you', 'yan', 'yang', 'yin', 'ying', 'yong',
    'wu', 'wa', 'wo', 'wai', 'wei', 'wan', 'wang', 'wen', 'weng',
    'yu', 'yue', 'yuan', 'yun'
]);
// Tone marks that might appear in pinyin (including v with combining tone marks)
const TONE_MARKS = /[āáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜ]|v[\u0301\u030C\u0300]?/g;
// Maps each tone-marked vowel back to its bare base vowel.
const TONE_TO_BASE = {
    'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a',
    'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e',
    'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i',
    'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o',
    'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u',
    'ǖ': 'ü', 'ǘ': 'ü', 'ǚ': 'ü', 'ǜ': 'ü'
};
/**
 * Lowercases text and strips tone marks, mapping each tone-marked vowel to
 * its base vowel ('v' carrying a combining tone mark collapses to plain 'v').
 * Shared by isValidPinyin and couldBePinyinPrefix, which previously carried
 * two identical copies of this logic.
 */
function stripToneMarks(text) {
    return text.toLowerCase().replace(TONE_MARKS, (match) => {
        // Handle v with combining tone marks
        if (match.startsWith('v')) {
            return 'v';
        }
        return TONE_TO_BASE[match] || match;
    });
}
const PinyinValidator = {
    /**
     * Check if a string is a valid pinyin syllable (with or without tone marks)
     */
    isValidPinyin(text) {
        if (!text || text.length === 0)
            return false;
        const normalized = stripToneMarks(text);
        // Check if it's a standalone syllable
        if (STANDALONE_SYLLABLES.has(normalized)) {
            return true;
        }
        // Try to parse as initial + final; test the 2-char initial first so
        // 'zh'/'ch'/'sh' are not mis-split as 'z'+'h' etc.
        if (normalized.length >= 2) {
            const possibleInitial2 = normalized.substring(0, 2);
            if (INITIALS.has(possibleInitial2) && FINALS.has(normalized.substring(2))) {
                return true;
            }
        }
        // Check single character initial
        if (normalized.length >= 1) {
            const possibleInitial1 = normalized.substring(0, 1);
            if (INITIALS.has(possibleInitial1) && FINALS.has(normalized.substring(1))) {
                return true;
            }
        }
        // Check if the whole string is a valid final (for syllables without initials)
        if (INITIALS.has('') && FINALS.has(normalized)) {
            return true;
        }
        return false;
    },
    /**
     * Check if a string could be a pinyin token (more lenient, for prefix matching)
     */
    couldBePinyinPrefix(text) {
        if (!text || text.length === 0)
            return false;
        const normalized = stripToneMarks(text);
        // Check if it's already valid pinyin
        if (this.isValidPinyin(text))
            return true;
        // Check if any standalone syllable starts with this prefix
        for (const syllable of STANDALONE_SYLLABLES) {
            if (syllable.startsWith(normalized))
                return true;
        }
        // Check if it matches any initial exactly or partially.
        // BUG FIX: the empty-string pseudo-initial must be skipped here —
        // normalized.startsWith('') is vacuously true, which previously made
        // this function return true for ANY non-empty input.
        for (const initial of INITIALS) {
            if (initial.length === 0)
                continue;
            if (initial.startsWith(normalized) || normalized.startsWith(initial)) {
                return true;
            }
        }
        return false;
    },
    /**
     * Split a string into potential pinyin tokens (for compound pinyin like "nihao")
     * Returns empty array if not valid pinyin
     */
    splitPinyinTokens(text) {
        // This is a simplified greedy longest-match; a full implementation
        // would need backtracking to handle ambiguous segmentations.
        const tokens = [];
        const normalized = text.toLowerCase();
        let remaining = normalized;
        while (remaining.length > 0) {
            let found = false;
            // Try to match the longest possible valid pinyin syllable (max 6 chars)
            for (let len = Math.min(6, remaining.length); len >= 1; len--) {
                const candidate = remaining.substring(0, len);
                if (this.isValidPinyin(candidate)) {
                    tokens.push(candidate);
                    remaining = remaining.substring(len);
                    found = true;
                    break;
                }
            }
            // If no valid syllable found, this isn't valid pinyin
            if (!found) {
                return [];
            }
        }
        return tokens;
    }
};
exports.PinyinValidator = PinyinValidator;
@@ -0,0 +1,21 @@
1
/**
 * Utility functions for preprocessing search queries
 */
/**
 * Cleans and normalizes a search string for database queries
 * - Removes trailing punctuation
 * - Trims whitespace
 * - Preserves internal punctuation (e.g., apostrophes in "don't")
 *
 * @param searchString The raw search string from user input
 * @returns The cleaned search string (empty string for null/undefined input)
 */
export declare function preprocessSearchString(searchString: string | null | undefined): string;
/**
 * Tests if a string contains Chinese characters
 * (CJK Unified Ideographs, U+4E00-U+9FFF)
 */
export declare function containsChineseCharacters(text: string): boolean;
/**
 * Tests if a string contains only ASCII punctuation
 * (one or more of ! ? . , : ;)
 */
export declare function isOnlyPunctuation(text: string): boolean;
@@ -0,0 +1,42 @@
1
+ "use strict";
2
+ /**
3
+ * Utility functions for preprocessing search queries
4
+ */
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.preprocessSearchString = preprocessSearchString;
7
+ exports.containsChineseCharacters = containsChineseCharacters;
8
+ exports.isOnlyPunctuation = isOnlyPunctuation;
9
/**
 * Cleans and normalizes a search string for database queries
 * - Removes trailing punctuation
 * - Trims whitespace
 * - Preserves internal punctuation (e.g., apostrophes in "don't")
 *
 * @param searchString The raw search string from user input
 * @returns The cleaned search string ('' for null/undefined/empty input)
 */
function preprocessSearchString(searchString) {
    if (!searchString) {
        return '';
    }
    // Strip surrounding whitespace, then the common sentence punctuation users
    // accidentally include at either end; interior characters stay untouched.
    const trailingPunctuation = /[!?.,:;]+$/;
    const leadingPunctuation = /^[!?.,:;]+/;
    return searchString
        .trim()
        .replace(trailingPunctuation, '')
        .replace(leadingPunctuation, '');
}
31
/**
 * Tests if a string contains Chinese characters
 * (any character in the CJK Unified Ideographs block, U+4E00-U+9FFF).
 */
function containsChineseCharacters(text) {
    const hanziPattern = /[\u4e00-\u9fff]/;
    return hanziPattern.test(text);
}
37
/**
 * Tests if a string contains only ASCII punctuation
 * (non-empty, every character one of ! ? . , : ;).
 */
function isOnlyPunctuation(text) {
    if (text.length === 0) {
        return false;
    }
    return [...text].every((ch) => '!?.,:;'.includes(ch));
}
@@ -0,0 +1,72 @@
1
/**
 * Token types for search query tokenization
 */
export declare enum TokenType {
    HANZI = "hanzi",
    PINYIN = "pinyin",
    ENGLISH = "english",
    AMBIGUOUS = "ambiguous", // Could be either pinyin or English
    PUNCTUATION = "punctuation",
    NUMBER = "number"
}
/**
 * Represents a single token from the search query
 */
export interface SearchToken {
    /** The token text exactly as it appeared in the input. */
    text: string;
    type: TokenType;
    /** Lowercased (and, for English tokens, apostrophe-normalized) form for matching. */
    normalized: string;
    /** Set when the token parses as pinyin (definite or ambiguous). */
    isPossiblePinyin?: boolean;
    /** Best pinyin segmentation of the token, when one exists. */
    pinyinVariants?: string[];
}
/**
 * Result of tokenizing a search string
 */
export interface TokenizedSearch {
    /** All tokens in input order. */
    tokens: SearchToken[];
    hanziTokens: SearchToken[];
    pinyinTokens: SearchToken[];
    englishTokens: SearchToken[];
    ambiguousTokens: SearchToken[];
    hasHanzi: boolean;
    hasPinyin: boolean;
    hasEnglish: boolean;
}
/**
 * Tokenizes and classifies search strings for multi-modal search
 * (hanzi, pinyin, English, and tokens that could be either).
 */
export declare class SearchTokenizer {
    private static readonly BOUNDARY_PUNCTUATION;
    private static readonly EMBEDDED_PUNCTUATION;
    private static readonly APOSTROPHES;
    private static readonly CHINESE_PUNCTUATION;
    private static readonly ALL_PUNCTUATION;
    private static readonly HANZI_REGEX;
    private static readonly PINYIN_TONE_MARKS;
    /**
     * Main tokenization method
     */
    static tokenize(searchString: string): TokenizedSearch;
    /**
     * Splits input string into tokens, handling punctuation intelligently
     * (consecutive hanzi are grouped into a single run token)
     */
    private static splitIntoTokens;
    /**
     * Normalize apostrophes to curly for English database matching.
     * The database stores English contractions with curly apostrophes (U+2019).
     */
    private static normalizeApostrophesToCurly;
    /**
     * Check if a token with embedded apostrophes could be pinyin with syllable separators.
     * Returns the parsed syllables if valid, null otherwise.
     */
    private static tryParsePinyinWithApostrophe;
    /**
     * Classifies a single token
     */
    private static classifyToken;
    /**
     * Checks if a string contains only hanzi characters
     */
    private static isAllHanzi;
}
@@ -0,0 +1,300 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.SearchTokenizer = exports.TokenType = void 0;
4
+ const PinyinParser_1 = require("./PinyinParser");
5
/**
 * Token types for search query tokenization
 */
// Compiled string-enum pattern: the IIFE populates the TokenType lookup
// object and publishes the same object on exports.TokenType.
var TokenType;
(function (TokenType) {
    TokenType["HANZI"] = "hanzi";
    TokenType["PINYIN"] = "pinyin";
    TokenType["ENGLISH"] = "english";
    TokenType["AMBIGUOUS"] = "ambiguous";
    TokenType["PUNCTUATION"] = "punctuation";
    TokenType["NUMBER"] = "number";
})(TokenType || (exports.TokenType = TokenType = {}));
17
/**
 * Tokenizes and classifies search strings for multi-modal search
 * (hanzi, pinyin, English, and tokens that could be either).
 */
class SearchTokenizer {
    /**
     * Main tokenization method: splits the input, classifies every token,
     * and returns the tokens plus per-type groupings and presence flags.
     */
    static tokenize(searchString) {
        if (!searchString || !searchString.trim()) {
            return {
                tokens: [],
                hanziTokens: [],
                pinyinTokens: [],
                englishTokens: [],
                ambiguousTokens: [],
                hasHanzi: false,
                hasPinyin: false,
                hasEnglish: false
            };
        }
        // First, split the string into raw tokens
        const rawTokens = this.splitIntoTokens(searchString);
        // Classify each token
        const classifiedTokens = rawTokens.map(token => this.classifyToken(token));
        // Group tokens by type
        const hanziTokens = classifiedTokens.filter(t => t.type === TokenType.HANZI);
        const pinyinTokens = classifiedTokens.filter(t => t.type === TokenType.PINYIN);
        const englishTokens = classifiedTokens.filter(t => t.type === TokenType.ENGLISH);
        const ambiguousTokens = classifiedTokens.filter(t => t.type === TokenType.AMBIGUOUS);
        return {
            tokens: classifiedTokens,
            hanziTokens,
            pinyinTokens,
            englishTokens,
            ambiguousTokens,
            hasHanzi: hanziTokens.length > 0,
            // Ambiguous tokens count toward both pinyin and English presence.
            hasPinyin: pinyinTokens.length > 0 || ambiguousTokens.some(t => t.isPossiblePinyin),
            hasEnglish: englishTokens.length > 0 || ambiguousTokens.length > 0
        };
    }
    /**
     * Splits input string into tokens, handling punctuation intelligently.
     * Whitespace and boundary punctuation split tokens; embedded apostrophes
     * and hyphens are kept when letter-surrounded; digit runs are their own
     * tokens; consecutive hanzi accumulate into a single run token.
     */
    static splitIntoTokens(input) {
        const tokens = [];
        let current = '';
        for (let i = 0; i < input.length; i++) {
            const char = input[i];
            const nextChar = i < input.length - 1 ? input[i + 1] : '';
            const prevChar = i > 0 ? input[i - 1] : '';
            // Handle whitespace - always splits tokens
            if (/\s/.test(char)) {
                if (current) {
                    tokens.push(current);
                    current = '';
                }
                continue;
            }
            // Handle Chinese characters - consecutive hanzi accumulate into one
            // run token, which is pushed when the run ends (the original
            // "each is its own token" comment did not match this behavior).
            if (this.HANZI_REGEX.test(char)) {
                if (current && !this.isAllHanzi(current)) {
                    tokens.push(current);
                    current = '';
                }
                current += char;
                // Check if next char is also hanzi, if not, push current
                if (!this.HANZI_REGEX.test(nextChar)) {
                    tokens.push(current);
                    current = '';
                }
                continue;
            }
            // Handle numbers - they should be separate tokens
            if (/\d/.test(char)) {
                // If we have accumulated non-digit text, push it
                if (current && !/^\d+$/.test(current)) {
                    tokens.push(current);
                    current = '';
                }
                current += char;
                // If next char is not a digit, push the number
                if (!/\d/.test(nextChar)) {
                    tokens.push(current);
                    current = '';
                }
                continue;
            }
            // Handle punctuation
            if (this.ALL_PUNCTUATION.test(char)) {
                // Check if it's embedded punctuation (like apostrophe in "don't" or "xi'an")
                if (this.EMBEDDED_PUNCTUATION.test(char)) {
                    // Only keep embedded if surrounded by letters (including accented pinyin vowels)
                    // This regex matches ASCII letters plus common pinyin tone-marked vowels
                    const letterPattern = /[a-zA-Z\u0101\u00E1\u01CE\u00E0\u0113\u00E9\u011B\u00E8\u012B\u00ED\u01D0\u00EC\u014D\u00F3\u01D2\u00F2\u016B\u00FA\u01D4\u00F9\u01D6\u01D8\u01DA\u01DC]/;
                    const isEmbedded = prevChar && nextChar &&
                        letterPattern.test(prevChar) &&
                        letterPattern.test(nextChar);
                    if (isEmbedded) {
                        current += char;
                    }
                    else {
                        // Treat as boundary
                        if (current) {
                            tokens.push(current);
                            current = '';
                        }
                        tokens.push(char);
                    }
                }
                else {
                    // Boundary punctuation - always splits
                    if (current) {
                        tokens.push(current);
                        current = '';
                    }
                    tokens.push(char);
                }
                continue;
            }
            // Regular character - add to current token
            // But split if transitioning from digits
            if (current && /^\d+$/.test(current)) {
                tokens.push(current);
                current = '';
            }
            current += char;
        }
        // Don't forget the last token
        if (current) {
            tokens.push(current);
        }
        return tokens.filter(t => t.length > 0);
    }
    /**
     * Normalize apostrophes to curly for English database matching.
     * The database stores English contractions with curly apostrophes (U+2019).
     */
    static normalizeApostrophesToCurly(text) {
        // U+0027 = ' (straight apostrophe)
        // U+2018 = ' (left single quotation mark)
        // Replace both with U+2019 (right single quotation mark / curly apostrophe)
        return text.replace(/[\u0027\u2018]/g, '\u2019');
    }
    /**
     * Check if a token with embedded apostrophes could be pinyin with syllable separators.
     * Returns the parsed syllables if valid, null otherwise.
     */
    static tryParsePinyinWithApostrophe(token) {
        // Only consider tokens that contain apostrophes
        if (!this.APOSTROPHES.test(token)) {
            return null;
        }
        // Use PinyinParser which now handles both straight and curly apostrophes
        const pinyinParsing = PinyinParser_1.PinyinParser.parse(token);
        // Only consider it pinyin if the parsing produced multiple syllables
        // (the apostrophe actually acted as a separator)
        if (pinyinParsing !== null && pinyinParsing.length > 1) {
            return pinyinParsing;
        }
        return null;
    }
    /**
     * Classifies a single token. Order of checks matters: punctuation,
     * number, hanzi, apostrophe-bearing tokens, tone-marked pinyin,
     * bare pinyin, and finally English as the fallback.
     */
    static classifyToken(token) {
        const normalized = token.toLowerCase();
        // Check for single-character punctuation
        if (token.length === 1 && this.ALL_PUNCTUATION.test(token)) {
            return {
                text: token,
                type: TokenType.PUNCTUATION,
                normalized
            };
        }
        // Check for numbers
        if (/^\d+$/.test(token)) {
            return {
                text: token,
                type: TokenType.NUMBER,
                normalized
            };
        }
        // Check for Chinese characters
        if (this.HANZI_REGEX.test(token)) {
            return {
                text: token,
                type: TokenType.HANZI,
                normalized: token // Hanzi doesn't need lowercasing
            };
        }
        // Check for tokens with embedded apostrophes - could be pinyin syllable separators
        if (this.APOSTROPHES.test(token)) {
            const pinyinWithApostrophe = this.tryParsePinyinWithApostrophe(token);
            if (pinyinWithApostrophe !== null) {
                // Token contains apostrophe that acts as pinyin syllable separator
                // Check if it has tone marks to determine if it's definitely pinyin or ambiguous
                if (this.PINYIN_TONE_MARKS.test(token)) {
                    return {
                        text: token,
                        type: TokenType.PINYIN,
                        normalized,
                        isPossiblePinyin: true,
                        pinyinVariants: pinyinWithApostrophe
                    };
                }
                else {
                    // Could be pinyin (xi'an) or could be English with apostrophe
                    // Treat as ambiguous
                    return {
                        text: token,
                        type: TokenType.AMBIGUOUS,
                        normalized,
                        isPossiblePinyin: true,
                        pinyinVariants: pinyinWithApostrophe
                    };
                }
            }
            // Not valid pinyin with apostrophe - treat as English
            // Normalize apostrophes to curly for database matching
            return {
                text: token,
                type: TokenType.ENGLISH,
                normalized: this.normalizeApostrophesToCurly(normalized)
            };
        }
        // Check for pinyin with tone marks
        if (this.PINYIN_TONE_MARKS.test(token)) {
            const pinyinParsing = PinyinParser_1.PinyinParser.parse(token);
            if (pinyinParsing !== null) {
                return {
                    text: token,
                    type: TokenType.PINYIN,
                    normalized,
                    isPossiblePinyin: true,
                    pinyinVariants: pinyinParsing
                };
            }
        }
        // Try to parse as pinyin (without tone marks)
        const pinyinParsing = PinyinParser_1.PinyinParser.parse(token);
        if (pinyinParsing !== null) {
            // Ambiguous - could be pinyin or English
            return {
                text: token,
                type: TokenType.AMBIGUOUS,
                normalized,
                isPossiblePinyin: true,
                pinyinVariants: pinyinParsing
            };
        }
        // Default to English
        return {
            text: token,
            type: TokenType.ENGLISH,
            normalized
        };
    }
    /**
     * Checks if a string contains only hanzi characters.
     * Note: split('') breaks astral characters into surrogate halves, which
     * fail HANZI_REGEX — hanzi outside the BMP are not recognized here.
     */
    static isAllHanzi(str) {
        return str.split('').every(char => this.HANZI_REGEX.test(char));
    }
}
exports.SearchTokenizer = SearchTokenizer;
// Common English punctuation that should be treated as word boundaries
// NOTE(review): BOUNDARY_PUNCTUATION is not referenced in this module;
// presumably kept for the public API surface — confirm before removing.
SearchTokenizer.BOUNDARY_PUNCTUATION = /[.!?;:,]/;
// Punctuation that can be embedded in words (apostrophes, hyphens)
// U+0027 = ' (straight apostrophe)
// U+2018 = ' (left single quotation mark)
// U+2019 = ' (right single quotation mark / curly apostrophe)
SearchTokenizer.EMBEDDED_PUNCTUATION = /[\u0027\u2018\u2019\-]/;
// Apostrophe characters (both straight and curly)
SearchTokenizer.APOSTROPHES = /[\u0027\u2018\u2019]/;
// Chinese punctuation marks
// U+2018/U+2019 = '' (curly single quotes)
// U+201C/U+201D = "" (curly double quotes)
// NOTE(review): CHINESE_PUNCTUATION is not referenced in this module either.
SearchTokenizer.CHINESE_PUNCTUATION = /[。!?;:,、\u201C\u201D\u2018\u2019()《》【】]/;
// All punctuation for detection
SearchTokenizer.ALL_PUNCTUATION = /[.!?;:,\u0027\u2018\u2019\-()。!?;:,、\u201C\u201D()《》【】]/;
// Chinese character ranges
SearchTokenizer.HANZI_REGEX = /[\u4e00-\u9fff]/;
// Pinyin tone marks
SearchTokenizer.PINYIN_TONE_MARKS = /[āáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜ]/i;
@@ -1,6 +1,11 @@
1
1
  export * from './AvatarUri';
2
2
  export * from './ConditionMatcher';
3
3
  export * from './Database';
4
+ export * from './DifficultyRange';
4
5
  export * from './Encryption';
5
6
  export * from './Logging';
7
+ export * from './PinyinParser';
8
+ export * from './PinyinValidator';
9
+ export * from './SearchPreprocessor';
10
+ export * from './SearchTokenizer';
6
11
  export * from './SenseRankEncoder';
@@ -18,6 +18,11 @@ Object.defineProperty(exports, "__esModule", { value: true });
18
18
  __exportStar(require("./AvatarUri"), exports);
19
19
  __exportStar(require("./ConditionMatcher"), exports);
20
20
  __exportStar(require("./Database"), exports);
21
+ __exportStar(require("./DifficultyRange"), exports);
21
22
  __exportStar(require("./Encryption"), exports);
22
23
  __exportStar(require("./Logging"), exports);
24
+ __exportStar(require("./PinyinParser"), exports);
25
+ __exportStar(require("./PinyinValidator"), exports);
26
+ __exportStar(require("./SearchPreprocessor"), exports);
27
+ __exportStar(require("./SearchTokenizer"), exports);
23
28
  __exportStar(require("./SenseRankEncoder"), exports);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@shaxpir/duiduidui-models",
3
- "version": "1.9.25",
3
+ "version": "1.9.27",
4
4
  "repository": {
5
5
  "type": "git",
6
6
  "url": "https://github.com/shaxpir/duiduidui-models"