@inseefr/lunatic 3.4.7 → 3.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/esm/type.source.d.ts +25 -40
- package/esm/type.source.js +1 -0
- package/esm/type.source.js.map +1 -1
- package/esm/utils/search/SearchMiniSearch.spec.d.ts +1 -0
- package/esm/utils/search/SearchMiniSearch.spec.js +51 -0
- package/esm/utils/search/SearchMiniSearch.spec.js.map +1 -0
- package/esm/utils/search/melauto.js +1 -1
- package/esm/utils/search/melauto.spec.d.ts +1 -0
- package/esm/utils/search/melauto.spec.js +67 -0
- package/esm/utils/search/melauto.spec.js.map +1 -0
- package/esm/utils/search/tokenizer.d.ts +7 -2
- package/esm/utils/search/tokenizer.js +23 -8
- package/esm/utils/search/tokenizer.js.map +1 -1
- package/esm/utils/search/tokenizer.spec.d.ts +1 -0
- package/esm/utils/search/tokenizer.spec.js +160 -0
- package/esm/utils/search/tokenizer.spec.js.map +1 -0
- package/package.json +23 -1
- package/src/type.source.ts +93 -108
- package/src/utils/search/SearchMiniSearch.spec.ts +58 -0
- package/src/utils/search/melauto.spec.ts +75 -0
- package/src/utils/search/melauto.ts +1 -1
- package/src/utils/search/tokenizer.spec.ts +205 -0
- package/src/utils/search/tokenizer.ts +27 -8
- package/tsconfig.build.tsbuildinfo +1 -1
- package/type.source.d.ts +25 -40
- package/type.source.js +1 -0
- package/type.source.js.map +1 -1
- package/utils/search/SearchMiniSearch.spec.d.ts +1 -0
- package/utils/search/SearchMiniSearch.spec.js +51 -0
- package/utils/search/SearchMiniSearch.spec.js.map +1 -0
- package/utils/search/melauto.js +1 -1
- package/utils/search/melauto.spec.d.ts +1 -0
- package/utils/search/melauto.spec.js +69 -0
- package/utils/search/melauto.spec.js.map +1 -0
- package/utils/search/tokenizer.d.ts +7 -2
- package/utils/search/tokenizer.js +24 -8
- package/utils/search/tokenizer.js.map +1 -1
- package/utils/search/tokenizer.spec.d.ts +1 -0
- package/utils/search/tokenizer.spec.js +162 -0
- package/utils/search/tokenizer.spec.js.map +1 -0
package/src/utils/search/tokenizer.spec.ts (new file)

```diff
@@ -0,0 +1,205 @@
+import { describe, it, expect } from 'vitest';
+import {
+  tokenizer,
+  tokenizeQuery,
+  tokenizeIndex,
+  filterStopWords,
+} from './tokenizer';
+import type { SearchInfo } from './SearchInterface';
+
+const mockSearchInfo: SearchInfo = {
+  name: 'Products',
+  fields: [
+    {
+      name: 'title',
+      min: 3,
+      rules: ['[\\w]+'],
+      synonyms: {
+        car: ['vehicle', 'automobile'],
+      },
+    },
+  ],
+  queryParser: {
+    type: 'tokenized',
+    params: {
+      language: 'English',
+      pattern: '[\\w.]+',
+      min: 2,
+    },
+  },
+};
+
+describe('filterStopWords', () => {
+  it('should remove only stopwords from the input string', () => {
+    const input = 'This is a test.';
+    const stopWords = ['is', 'a'];
+    const result = filterStopWords(input, stopWords);
+    expect(result).toBe('This test.');
+  });
+
+  it('should not alter words that are substrings of stopwords', () => {
+    const input = 'this is a testing example';
+    const stopWords = ['test'];
+    const result = filterStopWords(input, stopWords);
+    expect(result).toBe('this is a testing example');
+  });
+
+  it('should be case-insensitive', () => {
+    const input = 'This Is A Test.';
+    const stopWords = ['is', 'a'];
+    const result = filterStopWords(input, stopWords);
+    expect(result).toBe('This Test.');
+  });
+
+  it('should return the input string unchanged if stopWords is undefined', () => {
+    const input = 'This is a test.';
+    const result = filterStopWords(input);
+    expect(result).toBe(input);
+  });
+
+  it('should return the input string unchanged if stopWords is a empty array', () => {
+    const input = 'This is a test.';
+    const stopWords: string[] = [];
+    const result = filterStopWords(input, stopWords);
+    expect(result).toBe(input);
+  });
+
+  it('should return an empty string if all words are stopwords', () => {
+    const input = 'this is a test';
+    const stopWords = ['this', 'is', 'a', 'test'];
+    const result = filterStopWords(input, stopWords);
+    expect(result).toBe('');
+  });
+
+  it('should handle strings with multiple spaces correctly', () => {
+    const input = 'This  is  a   test.';
+    const stopWords = ['is', 'a'];
+    const result = filterStopWords(input, stopWords);
+    expect(result).toBe('This test.');
+  });
+
+  it('should handle empty input string', () => {
+    const input = '';
+    const stopWords = ['is', 'a'];
+    const result = filterStopWords(input, stopWords);
+    expect(result).toBe('');
+  });
+
+  it('should handle punctuation correctly', () => {
+    const input = 'Hello, world! This is a test.';
+    const stopWords = ['is', 'a'];
+    const result = filterStopWords(input, stopWords);
+    expect(result).toBe('Hello, world! This test.');
+  });
+});
+
+describe('tokenizeQuery', () => {
+  it('should tokenize based on soft type', () => {
+    const queryParser = { type: 'soft' } as SearchInfo['queryParser'];
+
+    const result = tokenizeQuery('This is a test!', queryParser);
+    expect(result).toEqual(['this', 'is', 'a', 'test']);
+  });
+
+  it('should tokenize with a custom regex and a min', () => {
+    const queryParser = {
+      type: 'tokenized',
+      params: { pattern: '[\\w.]+', min: 3 },
+    } as SearchInfo['queryParser'];
+
+    const result = tokenizeQuery('This is a test !', queryParser);
+    expect(result).toEqual(['this', 'test']);
+  });
+
+  it('should normalize the input', () => {
+    const queryParser = {
+      type: 'tokenized',
+      params: { pattern: '\\w+', min: 1 },
+    } as SearchInfo['queryParser'];
+
+    const result = tokenizeQuery('Élève Étudiant!', queryParser);
+    expect(result).toEqual(['eleve', 'etudiant']);
+  });
+
+  it('should return an empty array for unmatched patterns', () => {
+    const queryParser = {
+      type: 'tokenized',
+      params: { language: 'French', pattern: '[\\d.]+', min: 1 }, // only digits
+    } as SearchInfo['queryParser'];
+
+    const result = tokenizeQuery('No numbers here!', queryParser);
+    expect(result).toEqual([]);
+  });
+});
+
+describe('tokenizeIndex', () => {
+  it('should filter out words shorter than the required minimum length', () => {
+    const fieldInfo = mockSearchInfo.fields[0];
+
+    const result = tokenizeIndex('The bus is so slow', fieldInfo);
+    expect(result).toEqual(['the', 'bus', 'slow']);
+  });
+
+  it('should tokenize and apply synonyms', () => {
+    const fieldInfo = mockSearchInfo.fields[0];
+
+    const result = tokenizeIndex('The car is fast', fieldInfo);
+    expect(result).toEqual(['the', 'car', 'vehicle', 'automobile', 'fast']);
+  });
+
+  it('should normalize the input', () => {
+    const fieldInfo = mockSearchInfo.fields[0];
+
+    const result = tokenizeIndex('Élève Étudiant!', fieldInfo);
+    expect(result).toEqual(['eleve', 'etudiant']);
+  });
+
+  it('should filter out stopWords', () => {
+    const fieldInfo = { ...mockSearchInfo.fields[0], min: 1 };
+    const stopWords = ['is', 'the', 'of', 'this', 'a'];
+
+    const result = tokenizeIndex(
+      'This is a test of stopWords !',
+      fieldInfo,
+      stopWords
+    );
+    expect(result).toEqual(['test', 'stopwords']);
+  });
+
+  it('should return an empty array for unmatched patterns', () => {
+    const fieldInfo = { ...mockSearchInfo.fields[0], rules: ['\\d+'] }; // Only numbers
+
+    const result = tokenizeIndex('No numbers here', fieldInfo);
+    expect(result).toEqual([]);
+  });
+});
+
+describe('tokenizer', () => {
+  it('should tokenize using field rules', () => {
+    const tokenize = tokenizer(mockSearchInfo);
+
+    const result = tokenize('The car is fast', 'title');
+    expect(result).toEqual(['the', 'car', 'vehicle', 'automobile', 'fast']);
+  });
+
+  it('should tokenize using query parser when field is not found', () => {
+    const tokenize = tokenizer(mockSearchInfo);
+
+    const result = tokenize('This is a test!');
+    expect(result).toEqual(['this', 'is', 'test']);
+  });
+
+  it('should normalize the input', () => {
+    const tokenize = tokenizer(mockSearchInfo);
+
+    const result = tokenize('Élève Étudiant!');
+    expect(result).toEqual(['eleve', 'etudiant']);
+  });
+
+  it('should handle empty strings', () => {
+    const tokenize = tokenizer(mockSearchInfo);
+
+    expect(tokenize('', 'title')).toEqual([]);
+    expect(tokenize('')).toEqual([]);
+  });
+});
```
package/src/utils/search/tokenizer.ts

```diff
@@ -2,14 +2,16 @@ import type { SearchInfo } from './SearchInterface';
 import type { ItemOf } from '../../type.utils';
 
 /**
- * Generates a tokenize method
+ * Generates a tokenize method.
+ * When used for tokenizing a search query instead of the indexing, the fieldName is undefined.
  */
 export const tokenizer =
   (info: SearchInfo) => (str: string, fieldName?: string) => {
     const field = info.fields.find((f) => f.name === fieldName);
+    const stopWords = info.stopWords;
 
     return field
-      ? tokenizeIndex(str, field)
+      ? tokenizeIndex(str, field, stopWords)
       : tokenizeQuery(str, info.queryParser);
   };
 
@@ -25,8 +27,8 @@ export const tokenizeQuery = (str: string, info: SearchInfo['queryParser']) => {
 
   const wordRegex =
     info.params.pattern && info.params.pattern !== 'soft'
-      ?
-      :
+      ? new RegExp(info.params.pattern, 'gi')
+      : /\w+/gi;
   const minLength = info.params.min ?? 1;
 
   return (
@@ -41,12 +43,13 @@ export const tokenizeQuery = (str: string, info: SearchInfo['queryParser']) => {
  */
 export const tokenizeIndex = (
   str: string,
-  info: ItemOf<SearchInfo['fields']>
+  info: ItemOf<SearchInfo['fields']>,
+  stopWords?: string[]
 ) => {
   const wordRegex =
     info.rules && info.rules !== 'soft'
-      ?
-      :
+      ? new RegExp(info.rules![0], 'gi')
+      : /\w+/gi;
   const minLength = info.min ?? 1;
 
   // For synonyms, add the synonyms to the string
@@ -57,8 +60,9 @@ export const tokenizeIndex = (
     }
   }
 
+  // We remove the stopWords from the string
   return (
-    normalizeStr(str)
+    filterStopWords(normalizeStr(str), stopWords)
       .match(wordRegex)
       ?.filter((w) => w.length >= minLength) ?? []
   );
@@ -75,3 +79,18 @@ const normalizeStr = (str: string) => {
     .replace(/[\u0300-\u036f]/g, '')
     .toLowerCase();
 };
+
+/**
+ * remove from a string all the words that are included in a stopwords list
+ */
+export function filterStopWords(input: string, stopWords?: string[]): string {
+  if (!stopWords) {
+    return input;
+  }
+  const lowerCaseStopWords = stopWords.map((word) => word.toLowerCase());
+  const words = input.split(/\s+/);
+  const filteredWords = words.filter(
+    (word) => !lowerCaseStopWords.includes(word.toLowerCase())
+  );
+  return filteredWords.join(' ');
+}
```
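Taken together, the tokenizer.ts hunks thread an optional stop-word list from the SearchInfo object (read as `info.stopWords`) into `tokenizeIndex` via the new `filterStopWords` helper, while the query path (`tokenizeQuery`) is left unchanged. A minimal sketch of the resulting behavior follows; the `SearchInfo` literal is hypothetical and only assumes the top-level `stopWords` array implied by the `info.stopWords` read in the diff above:

```ts
import { tokenizer } from './tokenizer';
import type { SearchInfo } from './SearchInterface';

// Hypothetical config: the top-level `stopWords` property is an assumption
// based on the `info.stopWords` read introduced in this release.
const info = {
  name: 'Products',
  fields: [{ name: 'title', min: 1, rules: ['\\w+'], synonyms: {} }],
  queryParser: { type: 'tokenized', params: { pattern: '\\w+', min: 1 } },
  stopWords: ['the', 'is', 'a'],
} as SearchInfo;

const tokenize = tokenizer(info);

// Indexing path (fieldName given): stop words are stripped before matching.
tokenize('The car is a classic', 'title'); // ['car', 'classic']

// Query path (no fieldName): filterStopWords is not applied, so stop words
// survive tokenization.
tokenize('The car is a classic'); // ['the', 'car', 'is', 'a', 'classic']
```

Filtering stop words only at index time is consistent with the new tests: `tokenizeIndex` accepts the `stopWords` parameter directly, while none of the `tokenizeQuery` tests exercise stop-word removal.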