@toxplanet/pegasus-sdk 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/environment.acc.js +27 -0
- package/config/environment.dev.js +6 -1
- package/config/environment.prod.js +6 -1
- package/config/environment.qa.js +5 -0
- package/config/index.js +1 -1
- package/index.js +44 -37
- package/lib/chemicals.js +293 -229
- package/lib/connection.js +223 -223
- package/lib/documents.js +94 -47
- package/lib/elasticsearch.js +404 -0
- package/lib/search.js +336 -307
- package/lib/sync.js +43 -41
- package/lib/utils.js +49 -47
- package/package.json +48 -25
- package/env.example +0 -21
- package/index.d.ts +0 -252
- package/tests/chemicals.js +0 -165
- package/tests/search.js +0 -138
package/lib/search.js
CHANGED
|
@@ -1,307 +1,336 @@
|
|
|
1
|
-
const { logInfo, logError } = require('@toxplanet/tphelper/logging');
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* Detect if query looks like a CAS number (numbers with - or / separators)
|
|
5
|
-
* Returns array of alternative formats to try
|
|
6
|
-
*
|
|
7
|
-
* CAS format: XXXXXXX-XX-X (registry-sequence-check)
|
|
8
|
-
* - First part: 2-7 digits (registry number)
|
|
9
|
-
* - Second part: ALWAYS 2 digits (zero-padded)
|
|
10
|
-
* - Third part: ALWAYS 1 digit (check digit)
|
|
11
|
-
*/
|
|
12
|
-
function getCasNumberVariations(query) {
|
|
13
|
-
// Check if it's all numbers and separators (-, /)
|
|
14
|
-
if (!/^[\d\-\/]+$/.test(query)) {
|
|
15
|
-
return [query]; // Not a CAS-like format, return as-is
|
|
16
|
-
}
|
|
17
|
-
|
|
18
|
-
// Check if it has at least one separator
|
|
19
|
-
if (!query.includes('-') && !query.includes('/')) {
|
|
20
|
-
return [query]; // No separator, return as-is
|
|
21
|
-
}
|
|
22
|
-
|
|
23
|
-
// Generate variations
|
|
24
|
-
const variations = new Set();
|
|
25
|
-
|
|
26
|
-
// Add original
|
|
27
|
-
variations.add(query);
|
|
28
|
-
|
|
29
|
-
// Split by any separator
|
|
30
|
-
const parts = query.split(/[-\/]/);
|
|
31
|
-
|
|
32
|
-
if (parts.length === 3) {
|
|
33
|
-
// Three parts detected
|
|
34
|
-
// Could be:
|
|
35
|
-
// 1. Standard: registry-sequence-check (e.g., "7440-06-4")
|
|
36
|
-
// 2. Reversed: sequence/check/registry (e.g., "6/4/7440")
|
|
37
|
-
|
|
38
|
-
const [part1, part2, part3] = parts;
|
|
39
|
-
|
|
40
|
-
// Check if it looks like reversed format
|
|
41
|
-
// (small first part, small second part, large third part)
|
|
42
|
-
const isLikelyReversed = part1.length <= 2 && part2.length <= 2 && part3.length >= 3;
|
|
43
|
-
|
|
44
|
-
if (isLikelyReversed) {
|
|
45
|
-
// Format: sequence/check/registry → registry-sequence-check
|
|
46
|
-
const registry = part3;
|
|
47
|
-
const sequence = part1.padStart(2, '0'); // Zero-pad to 2 digits
|
|
48
|
-
const check = part2;
|
|
49
|
-
|
|
50
|
-
// Add properly formatted CAS
|
|
51
|
-
variations.add(`${registry}-${sequence}-${check}`);
|
|
52
|
-
variations.add(`${registry}/${sequence}/${check}`);
|
|
53
|
-
} else {
|
|
54
|
-
// Looks like standard format, try both separators
|
|
55
|
-
const registry = part1;
|
|
56
|
-
const sequence = part2.padStart(2, '0'); // Ensure 2 digits
|
|
57
|
-
const check = part3;
|
|
58
|
-
|
|
59
|
-
variations.add(`${registry}-${sequence}-${check}`);
|
|
60
|
-
variations.add(`${registry}/${sequence}/${check}`);
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
// Also try with different separator on original parts
|
|
64
|
-
variations.add(parts.join('-'));
|
|
65
|
-
variations.add(parts.join('/'));
|
|
66
|
-
} else if (parts.length === 2) {
|
|
67
|
-
// Two parts - just try both separators
|
|
68
|
-
variations.add(parts.join('-'));
|
|
69
|
-
variations.add(parts.join('/'));
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
return Array.from(variations);
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
class SearchService {
|
|
76
|
-
constructor(connection) {
|
|
77
|
-
this.connection = connection;
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
/**
|
|
81
|
-
* Search chemicals using OpenSearch with configurable boost parameters
|
|
82
|
-
* @param {string} query - Search query string
|
|
83
|
-
* @param {Object} options - Search options
|
|
84
|
-
* @param {number} options.limit - Maximum number of results (default: 10)
|
|
85
|
-
* @param {number} options.casExact - Boost for exact CAS matches (default: 50)
|
|
86
|
-
* @param {number} options.casPrefix - Boost for CAS prefix matches (default: 10)
|
|
87
|
-
* @param {number} options.nameExact - Boost for exact name matches (default: 40)
|
|
88
|
-
* @param {number} options.namePrefix - Boost for name prefix matches (default: 8)
|
|
89
|
-
* @param {number} options.identifierExact - Boost for exact identifier matches (default: 30)
|
|
90
|
-
* @param {number} options.identifierPrefix - Boost for identifier prefix matches (default: 5)
|
|
91
|
-
* @param {number} options.synonymExact - Boost for exact synonym matches (default: 100)
|
|
92
|
-
* @param {number} options.synonymPrefix - Boost for synonym prefix matches (default: 3)
|
|
93
|
-
* @returns {Promise<Object>} Search results with chemicals array
|
|
94
|
-
*/
|
|
95
|
-
async searchChemicals(query, options = {}) {
|
|
96
|
-
if (!query) {
|
|
97
|
-
return { results: [] };
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
// Extract options with defaults
|
|
101
|
-
const limit = options.limit || 10;
|
|
102
|
-
const casExact = options.casExact !== undefined ? options.casExact : 50;
|
|
103
|
-
const casPrefix = options.casPrefix !== undefined ? options.casPrefix : 10;
|
|
104
|
-
const nameExact = options.nameExact !== undefined ? options.nameExact : 40;
|
|
105
|
-
const namePrefix = options.namePrefix !== undefined ? options.namePrefix : 8;
|
|
106
|
-
const identifierExact = options.identifierExact !== undefined ? options.identifierExact : 30;
|
|
107
|
-
const identifierPrefix = options.identifierPrefix !== undefined ? options.identifierPrefix : 5;
|
|
108
|
-
const synonymExact = options.synonymExact !== undefined ? options.synonymExact : 100;
|
|
109
|
-
const synonymPrefix = options.synonymPrefix !== undefined ? options.synonymPrefix : 3;
|
|
110
|
-
|
|
111
|
-
try {
|
|
112
|
-
const opensearchClient = this.connection.getOpenSearchClient();
|
|
113
|
-
|
|
114
|
-
// Get CAS number variations (if applicable)
|
|
115
|
-
const queryVariations = getCasNumberVariations(query);
|
|
116
|
-
|
|
117
|
-
// Log if we're trying multiple variations
|
|
118
|
-
if (queryVariations.length > 1) {
|
|
119
|
-
logInfo('pegasus-sdk', `CAS format detection: trying ${queryVariations.length} variations for "${query}": ${JSON.stringify(queryVariations)}`);
|
|
120
|
-
}
|
|
121
|
-
|
|
122
|
-
// Build should clauses for all query variations
|
|
123
|
-
const shouldClauses = [];
|
|
124
|
-
|
|
125
|
-
for (const queryVariation of queryVariations) {
|
|
126
|
-
// Exact matches (configurable priority)
|
|
127
|
-
shouldClauses.push(
|
|
128
|
-
{ term: { 'cas_numbers': { value: queryVariation, boost: casExact } } },
|
|
129
|
-
{ term: { 'chemical_name.keyword': { value: queryVariation, boost: nameExact, case_insensitive: true } } },
|
|
130
|
-
{ term: { 'identifier_values': { value: queryVariation, boost: identifierExact } } },
|
|
131
|
-
{ term: { 'synonyms.keyword': { value: queryVariation, boost: synonymExact, case_insensitive: true } } },
|
|
132
|
-
// Prefix matches (configurable priority)
|
|
133
|
-
{ prefix: { 'cas_numbers': { value: queryVariation, boost: casPrefix } } },
|
|
134
|
-
{ prefix: { 'chemical_name.keyword': { value: queryVariation, boost: namePrefix, case_insensitive: true } } },
|
|
135
|
-
{ prefix: { 'identifier_values': { value: queryVariation, boost: identifierPrefix } } },
|
|
136
|
-
{ prefix: { 'synonyms.keyword': { value: queryVariation, boost: synonymPrefix, case_insensitive: true } } }
|
|
137
|
-
);
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
const indexName = this.connection.getOpenSearchIndex();
|
|
141
|
-
|
|
142
|
-
const response = await opensearchClient.search({
|
|
143
|
-
index: indexName,
|
|
144
|
-
body: {
|
|
145
|
-
size: limit,
|
|
146
|
-
query: {
|
|
147
|
-
bool: {
|
|
148
|
-
should: shouldClauses,
|
|
149
|
-
minimum_should_match: 1
|
|
150
|
-
}
|
|
151
|
-
},
|
|
152
|
-
_source: ['postgres_id', 'chemical_name', 'cas_numbers', 'identifier_values', 'synonyms']
|
|
153
|
-
}
|
|
154
|
-
});
|
|
155
|
-
|
|
156
|
-
const hits = response.body?.hits?.hits || [];
|
|
157
|
-
const results = hits.map((hit) => ({
|
|
158
|
-
id: hit._source.postgres_id,
|
|
159
|
-
name: hit._source.chemical_name,
|
|
160
|
-
cas: hit._source.cas_numbers || [],
|
|
161
|
-
identifiers: hit._source.identifier_values || [],
|
|
162
|
-
synonyms: hit._source.synonyms || [],
|
|
163
|
-
score: hit._score
|
|
164
|
-
}));
|
|
165
|
-
|
|
166
|
-
return { results };
|
|
167
|
-
} catch (error) {
|
|
168
|
-
logError('pegasus-sdk', 'SearchService', 'searchChemicals', error);
|
|
169
|
-
throw error;
|
|
170
|
-
}
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
/**
|
|
174
|
-
* Search for chemicals with prefix matching priority
|
|
175
|
-
* @param {string} searchTerm - Search term
|
|
176
|
-
* @param {number} limit - Maximum number of results (default: 10)
|
|
177
|
-
* @returns {Promise<Object>} Search results
|
|
178
|
-
*/
|
|
179
|
-
async searchStartsWith(searchTerm, limit = 10) {
|
|
180
|
-
return this.searchChemicals(searchTerm, {
|
|
181
|
-
limit,
|
|
182
|
-
// Prioritize prefix matches over exact matches
|
|
183
|
-
casPrefix: 50,
|
|
184
|
-
casExact: 20,
|
|
185
|
-
namePrefix: 40,
|
|
186
|
-
nameExact: 15,
|
|
187
|
-
identifierPrefix: 30,
|
|
188
|
-
identifierExact: 10,
|
|
189
|
-
synonymPrefix: 35,
|
|
190
|
-
synonymExact: 10
|
|
191
|
-
});
|
|
192
|
-
}
|
|
193
|
-
|
|
194
|
-
/**
|
|
195
|
-
* Search for chemicals (alias for general search)
|
|
196
|
-
* @param {string} searchTerm - Search term
|
|
197
|
-
* @param {number} limit - Maximum number of results (default: 10)
|
|
198
|
-
* @returns {Promise<Object>} Search results
|
|
199
|
-
*/
|
|
200
|
-
async searchContains(searchTerm, limit = 10) {
|
|
201
|
-
// Use default balanced weights for contains search
|
|
202
|
-
return this.searchChemicals(searchTerm, { limit });
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
/**
|
|
206
|
-
* Search for chemicals with exact matching priority
|
|
207
|
-
* @param {string} searchTerm - Search term
|
|
208
|
-
* @param {number} limit - Maximum number of results (default: 10)
|
|
209
|
-
* @returns {Promise<Object>} Search results
|
|
210
|
-
*/
|
|
211
|
-
async searchExact(searchTerm, limit = 10) {
|
|
212
|
-
return this.searchChemicals(searchTerm, {
|
|
213
|
-
limit,
|
|
214
|
-
// Prioritize exact matches, minimize prefix matches
|
|
215
|
-
casExact: 100,
|
|
216
|
-
casPrefix: 1,
|
|
217
|
-
nameExact: 80,
|
|
218
|
-
namePrefix: 1,
|
|
219
|
-
identifierExact: 60,
|
|
220
|
-
identifierPrefix: 1,
|
|
221
|
-
synonymExact: 150,
|
|
222
|
-
synonymPrefix: 1
|
|
223
|
-
});
|
|
224
|
-
}
|
|
225
|
-
|
|
226
|
-
/**
|
|
227
|
-
* Search for chemicals by CAS number
|
|
228
|
-
* @param {string} casNumber - CAS number to search for
|
|
229
|
-
* @param {string} searchType - Search type: 'exact' or 'prefix' (default: 'exact')
|
|
230
|
-
* @returns {Promise<Object>} Search results
|
|
231
|
-
*/
|
|
232
|
-
async searchByCAS(casNumber, searchType = 'exact') {
|
|
233
|
-
const isExact = searchType === 'exact';
|
|
234
|
-
return this.searchChemicals(casNumber, {
|
|
235
|
-
limit: 10,
|
|
236
|
-
// Heavily prioritize CAS field
|
|
237
|
-
casExact: isExact ? 200 : 50,
|
|
238
|
-
casPrefix: isExact ? 10 : 100,
|
|
239
|
-
nameExact: 5,
|
|
240
|
-
namePrefix: 1,
|
|
241
|
-
identifierExact: 5,
|
|
242
|
-
identifierPrefix: 1,
|
|
243
|
-
synonymExact: 5,
|
|
244
|
-
synonymPrefix: 1
|
|
245
|
-
});
|
|
246
|
-
}
|
|
247
|
-
|
|
248
|
-
/**
|
|
249
|
-
* Search for chemicals by identifier value
|
|
250
|
-
* @param {string} identifierValue - Identifier value to search for
|
|
251
|
-
* @param {string} searchType - Search type: 'exact' or 'prefix' (default: 'exact')
|
|
252
|
-
* @returns {Promise<Object>} Search results
|
|
253
|
-
*/
|
|
254
|
-
async searchByIdentifier(identifierValue, searchType = 'exact') {
|
|
255
|
-
const isExact = searchType === 'exact';
|
|
256
|
-
return this.searchChemicals(identifierValue, {
|
|
257
|
-
limit: 10,
|
|
258
|
-
// Heavily prioritize identifier field
|
|
259
|
-
identifierExact: isExact ? 200 : 50,
|
|
260
|
-
identifierPrefix: isExact ? 10 : 100,
|
|
261
|
-
casExact: 10,
|
|
262
|
-
casPrefix: 5,
|
|
263
|
-
nameExact: 5,
|
|
264
|
-
namePrefix: 1,
|
|
265
|
-
synonymExact: 5,
|
|
266
|
-
synonymPrefix: 1
|
|
267
|
-
});
|
|
268
|
-
}
|
|
269
|
-
|
|
270
|
-
/**
|
|
271
|
-
* Search for chemicals by synonym
|
|
272
|
-
* @param {string} synonymTerm - Synonym term to search for
|
|
273
|
-
* @param {string} searchType - Search type: 'exact' or 'prefix' (default: 'exact')
|
|
274
|
-
* @returns {Promise<Object>} Search results
|
|
275
|
-
*/
|
|
276
|
-
async searchBySynonym(synonymTerm, searchType = 'exact') {
|
|
277
|
-
const isExact = searchType === 'exact';
|
|
278
|
-
return this.searchChemicals(synonymTerm, {
|
|
279
|
-
limit: 10,
|
|
280
|
-
// Heavily prioritize synonym field
|
|
281
|
-
synonymExact: isExact ? 200 : 50,
|
|
282
|
-
synonymPrefix: isExact ? 10 : 100,
|
|
283
|
-
nameExact: 20,
|
|
284
|
-
namePrefix: 5,
|
|
285
|
-
casExact: 10,
|
|
286
|
-
casPrefix: 5,
|
|
287
|
-
identifierExact: 5,
|
|
288
|
-
identifierPrefix: 1
|
|
289
|
-
});
|
|
290
|
-
}
|
|
291
|
-
|
|
292
|
-
async advancedSearch(queryBuilder) {}
|
|
293
|
-
|
|
294
|
-
async searchWithFilters(searchTerm, filters, limit) {}
|
|
295
|
-
|
|
296
|
-
async searchByCollection(collectionName, searchTerm, limit) {}
|
|
297
|
-
|
|
298
|
-
async aggregateByCategory() {}
|
|
299
|
-
|
|
300
|
-
async aggregateByIdentifierType() {}
|
|
301
|
-
|
|
302
|
-
async getSearchSuggestions(partialTerm, limit) {}
|
|
303
|
-
|
|
304
|
-
async findSimilarChemicals(chemicalId, limit) {}
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
1
|
+
const { logInfo, logError } = require('@toxplanet/tphelper/logging');
|
|
2
|
+
|
|
3
|
+
/**
 * Expand a CAS-number-like query into the alternative spellings worth searching.
 *
 * A query is treated as CAS-like only when it consists solely of digits,
 * '-' and '/' AND contains at least one of those separators. Anything else is
 * returned untouched as a single-element array.
 *
 * Canonical CAS layout is registry-sequence-check, e.g. "7440-06-4":
 *   - registry: 2-7 digits
 *   - sequence: 2 digits (zero-padded here when shorter)
 *   - check:    1 digit
 *
 * A three-part query whose first two parts are short (<= 2 chars) and whose
 * last part is long (>= 3 chars) is assumed to be written in reverse,
 * sequence/check/registry (e.g. "6/4/7440"), and is re-ordered.
 *
 * @param {string} query - Raw search text.
 * @returns {string[]} Unique variations, original query first, in insertion order.
 */
function getCasNumberVariations(query) {
  // Only digit/separator strings that actually contain a separator qualify.
  const isCasLike =
    /^[\d\-\/]+$/.test(query) && (query.includes('-') || query.includes('/'));
  if (!isCasLike) {
    return [query];
  }

  const segments = query.split(/[-\/]/);

  // Ordered candidate list; duplicates are collapsed at the end.
  const candidates = [query];

  if (segments.length === 3) {
    const [first, second, third] = segments;

    // Short, short, long => most likely sequence/check/registry.
    const looksReversed = first.length <= 2 && second.length <= 2 && third.length >= 3;

    const [registry, rawSequence, check] = looksReversed
      ? [third, first, second]
      : [first, second, third];

    // The sequence part is always two digits in a well-formed CAS number.
    const sequence = rawSequence.padStart(2, '0');

    candidates.push(
      `${registry}-${sequence}-${check}`,
      `${registry}/${sequence}/${check}`
    );
  }

  if (segments.length === 2 || segments.length === 3) {
    // Also offer the parts exactly as typed, with each separator style.
    candidates.push(segments.join('-'), segments.join('/'));
  }

  // Set keeps first-insertion order, so the original query stays first.
  return [...new Set(candidates)];
}
|
|
74
|
+
|
|
75
|
+
/**
 * Chemical search facade on top of an OpenSearch index.
 *
 * Every public search method funnels into {@link SearchService#searchChemicals},
 * differing only in the boost weights passed for exact vs. prefix matches on
 * the CAS, name, identifier and synonym fields.
 */
class SearchService {
  /**
   * @param {Object} connection - Connection wrapper. This class calls
   *   `getOpenSearchClient()` and `getOpenSearchIndex()` on it and reads
   *   `config.indexRoutes` from it.
   */
  constructor(connection) {
    this.connection = connection;
  }

  /**
   * Search chemicals using OpenSearch with configurable boost parameters.
   *
   * CAS-like queries are expanded via getCasNumberVariations(); each variation
   * contributes one exact (`term`) and one `prefix` clause per field, all
   * combined in a `bool.should` with `minimum_should_match: 1`.
   *
   * @param {string} query - Search query string. A falsy query short-circuits to an empty result.
   * @param {Object} [options] - Search options
   * @param {number} [options.limit] - Maximum number of results (default: 10; falsy values also yield 10)
   * @param {number} [options.casExact] - Boost for exact CAS matches (default: 50)
   * @param {number} [options.casPrefix] - Boost for CAS prefix matches (default: 10)
   * @param {number} [options.nameExact] - Boost for exact name matches (default: 40)
   * @param {number} [options.namePrefix] - Boost for name prefix matches (default: 8)
   * @param {number} [options.identifierExact] - Boost for exact identifier matches (default: 30)
   * @param {number} [options.identifierPrefix] - Boost for identifier prefix matches (default: 5)
   * @param {number} [options.synonymExact] - Boost for exact synonym matches (default: 100)
   * @param {number} [options.synonymPrefix] - Boost for synonym prefix matches (default: 3)
   * @returns {Promise<{results: Array<{id: *, name: *, cas: Array, identifiers: Array, synonyms: Array, score: number}>}>}
   *   Hits mapped to plain result objects.
   * @throws Re-throws any error raised while querying, after logging it.
   */
  async searchChemicals(query, options = {}) {
    if (!query) {
      return { results: [] };
    }

    // Extract options with defaults. Boosts use an explicit undefined check so
    // that a caller-supplied 0 is honoured.
    const limit = options.limit || 10;
    const casExact = options.casExact !== undefined ? options.casExact : 50;
    const casPrefix = options.casPrefix !== undefined ? options.casPrefix : 10;
    const nameExact = options.nameExact !== undefined ? options.nameExact : 40;
    const namePrefix = options.namePrefix !== undefined ? options.namePrefix : 8;
    const identifierExact = options.identifierExact !== undefined ? options.identifierExact : 30;
    const identifierPrefix = options.identifierPrefix !== undefined ? options.identifierPrefix : 5;
    const synonymExact = options.synonymExact !== undefined ? options.synonymExact : 100;
    const synonymPrefix = options.synonymPrefix !== undefined ? options.synonymPrefix : 3;

    try {
      const opensearchClient = this.connection.getOpenSearchClient();

      // Get CAS number variations (if applicable)
      const queryVariations = getCasNumberVariations(query);

      // Log if we're trying multiple variations
      if (queryVariations.length > 1) {
        logInfo('pegasus-sdk', `CAS format detection: trying ${queryVariations.length} variations for "${query}": ${JSON.stringify(queryVariations)}`);
      }

      // Build should clauses for all query variations
      const shouldClauses = [];

      for (const queryVariation of queryVariations) {
        shouldClauses.push(
          // Exact matches (configurable priority)
          { term: { 'cas_numbers': { value: queryVariation, boost: casExact } } },
          { term: { 'chemical_name.keyword': { value: queryVariation, boost: nameExact, case_insensitive: true } } },
          { term: { 'identifier_values': { value: queryVariation, boost: identifierExact } } },
          { term: { 'synonyms.keyword': { value: queryVariation, boost: synonymExact, case_insensitive: true } } },
          // Prefix matches (configurable priority)
          { prefix: { 'cas_numbers': { value: queryVariation, boost: casPrefix } } },
          { prefix: { 'chemical_name.keyword': { value: queryVariation, boost: namePrefix, case_insensitive: true } } },
          { prefix: { 'identifier_values': { value: queryVariation, boost: identifierPrefix } } },
          { prefix: { 'synonyms.keyword': { value: queryVariation, boost: synonymPrefix, case_insensitive: true } } }
        );
      }

      const indexName = this.connection.getOpenSearchIndex();

      const response = await opensearchClient.search({
        index: indexName,
        body: {
          size: limit,
          query: {
            bool: {
              should: shouldClauses,
              minimum_should_match: 1
            }
          },
          _source: ['postgres_id', 'chemical_name', 'cas_numbers', 'identifier_values', 'synonyms']
        }
      });

      // A response without hits is treated as "no results" rather than an error.
      const hits = response.body?.hits?.hits || [];
      const results = hits.map((hit) => ({
        id: hit._source.postgres_id,
        name: hit._source.chemical_name,
        cas: hit._source.cas_numbers || [],
        identifiers: hit._source.identifier_values || [],
        synonyms: hit._source.synonyms || [],
        score: hit._score
      }));

      return { results };
    } catch (error) {
      logError('pegasus-sdk', 'SearchService', 'searchChemicals', error);
      throw error;
    }
  }

  /**
   * Search for chemicals with prefix matching priority.
   * @param {string} searchTerm - Search term
   * @param {number} [limit=10] - Maximum number of results
   * @returns {Promise<Object>} Search results, as returned by searchChemicals
   */
  async searchStartsWith(searchTerm, limit = 10) {
    return this.searchChemicals(searchTerm, {
      limit,
      // Prioritize prefix matches over exact matches
      casPrefix: 50,
      casExact: 20,
      namePrefix: 40,
      nameExact: 15,
      identifierPrefix: 30,
      identifierExact: 10,
      synonymPrefix: 35,
      synonymExact: 10
    });
  }

  /**
   * Search for chemicals using the default boost weights of searchChemicals.
   * @param {string} searchTerm - Search term
   * @param {number} [limit=10] - Maximum number of results
   * @returns {Promise<Object>} Search results, as returned by searchChemicals
   */
  async searchContains(searchTerm, limit = 10) {
    // Use default balanced weights for contains search
    return this.searchChemicals(searchTerm, { limit });
  }

  /**
   * Search for chemicals with exact matching priority.
   * @param {string} searchTerm - Search term
   * @param {number} [limit=10] - Maximum number of results
   * @returns {Promise<Object>} Search results, as returned by searchChemicals
   */
  async searchExact(searchTerm, limit = 10) {
    return this.searchChemicals(searchTerm, {
      limit,
      // Prioritize exact matches, minimize prefix matches
      casExact: 100,
      casPrefix: 1,
      nameExact: 80,
      namePrefix: 1,
      identifierExact: 60,
      identifierPrefix: 1,
      synonymExact: 150,
      synonymPrefix: 1
    });
  }

  /**
   * Search for chemicals by CAS number (fixed limit of 10 results).
   * @param {string} casNumber - CAS number to search for
   * @param {string} [searchType='exact'] - 'exact' favours exact CAS matches;
   *   any other value favours CAS prefix matches
   * @returns {Promise<Object>} Search results, as returned by searchChemicals
   */
  async searchByCAS(casNumber, searchType = 'exact') {
    const isExact = searchType === 'exact';
    return this.searchChemicals(casNumber, {
      limit: 10,
      // Heavily prioritize CAS field
      casExact: isExact ? 200 : 50,
      casPrefix: isExact ? 10 : 100,
      nameExact: 5,
      namePrefix: 1,
      identifierExact: 5,
      identifierPrefix: 1,
      synonymExact: 5,
      synonymPrefix: 1
    });
  }

  /**
   * Search for chemicals by identifier value (fixed limit of 10 results).
   * @param {string} identifierValue - Identifier value to search for
   * @param {string} [searchType='exact'] - 'exact' favours exact identifier
   *   matches; any other value favours identifier prefix matches
   * @returns {Promise<Object>} Search results, as returned by searchChemicals
   */
  async searchByIdentifier(identifierValue, searchType = 'exact') {
    const isExact = searchType === 'exact';
    return this.searchChemicals(identifierValue, {
      limit: 10,
      // Heavily prioritize identifier field
      identifierExact: isExact ? 200 : 50,
      identifierPrefix: isExact ? 10 : 100,
      casExact: 10,
      casPrefix: 5,
      nameExact: 5,
      namePrefix: 1,
      synonymExact: 5,
      synonymPrefix: 1
    });
  }

  /**
   * Search for chemicals by synonym (fixed limit of 10 results).
   * @param {string} synonymTerm - Synonym term to search for
   * @param {string} [searchType='exact'] - 'exact' favours exact synonym
   *   matches; any other value favours synonym prefix matches
   * @returns {Promise<Object>} Search results, as returned by searchChemicals
   */
  async searchBySynonym(synonymTerm, searchType = 'exact') {
    const isExact = searchType === 'exact';
    return this.searchChemicals(synonymTerm, {
      limit: 10,
      // Heavily prioritize synonym field
      synonymExact: isExact ? 200 : 50,
      synonymPrefix: isExact ? 10 : 100,
      nameExact: 20,
      namePrefix: 5,
      casExact: 10,
      casPrefix: 5,
      identifierExact: 5,
      identifierPrefix: 1
    });
  }

  // The methods below are unimplemented placeholders: each has an empty body
  // and therefore resolves to undefined.

  async advancedSearch(queryBuilder) {}

  async searchWithFilters(searchTerm, filters, limit) {}

  async searchByCollection(collectionName, searchTerm, limit) {}

  async aggregateByCategory() {}

  async aggregateByIdentifierType() {}

  async getSearchSuggestions(partialTerm, limit) {}

  async findSimilarChemicals(chemicalId, limit) {}

  /**
   * Register `search` and `count` handlers on an Elasticsearch-style router
   * for every index pattern this service is responsible for.
   *
   * Patterns come from `connection.config.indexRoutes.search` when set,
   * otherwise a single default pattern matching index names that start with
   * "chemicals", "substances" or "search" is used.
   *
   * @param {Object} elasticsearchService - Object exposing
   *   `registerIndexRoute(pattern, handlers)`.
   * @returns {void}
   */
  registerElasticsearchHandlers(elasticsearchService) {
    const indexPatterns = this.connection.config.indexRoutes?.search || [/^(chemicals|substances|search)/];

    // Pull the user's search text out of the supported query shapes
    // (match / term on chemical_name, or query_string); '' when none is present.
    const extractSearchTerm = (query) =>
      query?.match?.chemical_name ||
      query?.term?.chemical_name ||
      query?.query_string?.query ||
      '';

    indexPatterns.forEach((pattern) => {
      elasticsearchService.registerIndexRoute(pattern, {
        search: async (params) => {
          const query = params.body?.query;
          // The match_all fallback must be parenthesised: `||` binds tighter
          // than `?:`, so without the parentheses any extracted term would be
          // discarded and replaced by '*'.
          // NOTE(review): '*' is forwarded to searchChemicals, which puts it
          // verbatim into term/prefix clauses — confirm that this gives the
          // intended match_all behaviour.
          const searchTerm = extractSearchTerm(query) || (query?.match_all ? '*' : '');
          const limit = params.body?.size || 10;

          return await this.searchChemicals(searchTerm, { limit });
        },

        count: async (params) => {
          const searchTerm = extractSearchTerm(params.body?.query);

          // The count is derived from an actual search, so it is capped at 10000.
          const results = await this.searchChemicals(searchTerm, { limit: 10000 });
          return { count: results.results.length };
        }
      });
    });
  }
}
|
|
335
|
+
|
|
336
|
+
module.exports = SearchService;
|