@toxplanet/pegasus-sdk 1.2.11 → 1.2.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/chemicals.js +68 -13
- package/package.json +1 -1
package/lib/chemicals.js
CHANGED
|
@@ -9,6 +9,25 @@ const SEARCH_BOOST_PREFIX_SECONDARY = 10;
|
|
|
9
9
|
|
|
10
10
|
// Set of identifier type labels (membership checks are case-sensitive).
// NOTE(review): presumably the whitelist of identifier types callers may
// supply — its consumers are not visible in this excerpt; confirm.
const ALLOWED_IDENTIFIER_TYPES = new Set(['CAS', 'SMILES', 'InChI', 'InChIKey', 'PubChem', 'CID', 'DTXSID', 'EINECS', 'EC']);
|
|
11
11
|
|
|
12
|
+
/**
 * Heuristic: decide whether a search term is shaped like a chemical identifier
 * (CAS, EC, InChIKey, PubChem CID, DTXSID) rather than a chemical name.
 *
 * A `true` result lets the search restrict itself to keyword matches, so that
 * tokenized text-field matches on names do not add noise.
 *
 * @param {*} term - Raw search term; coerced with String() and trimmed.
 * @returns {boolean} true when the trimmed term matches one of the identifier
 *   shapes below; false for falsy/blank input or any other text.
 */
function looksLikeIdentifier(term) {
  const candidate = term ? String(term).trim() : '';
  if (candidate === '') {
    return false;
  }

  const identifierShapes = [
    // CAS / EC: digit groups joined by hyphen or slash separators (CAS xxxxxxx-xx-x).
    /^\d{1,7}[-/]\d{1,3}[-/]\d{1,2}$/,
    // EC xxx-xxx-x (already covered by the looser pattern above; kept explicit).
    /^\d{2,3}-\d{3}-\d$/,
    // InChIKey: 14-10-1 uppercase letters with hyphens (case-sensitive).
    /^[A-Z]{14}-[A-Z]{10}-[A-Z]$/,
    // PubChem CID, any letter case.
    /^CID\d+$/i,
    // DTXSID, any letter case.
    /^DTXSID\d+$/i,
  ];

  return identifierShapes.some((shape) => shape.test(candidate));
}
|
|
30
|
+
|
|
12
31
|
// Equivalence groups for CDI identifier_key normalization. When any member of
|
|
13
32
|
// a group is present on a row, the others are backfilled with the same values
|
|
14
33
|
// so consumers can look up by their preferred spelling.
|
|
@@ -776,6 +795,10 @@ class ChemicalsService {
|
|
|
776
795
|
query: {
|
|
777
796
|
bool: {
|
|
778
797
|
should: [
|
|
798
|
+
// v2 mapping: cas_numbers / identifier_values are keyword type.
|
|
799
|
+
{ term: { 'cas_numbers': identifierValue } },
|
|
800
|
+
{ term: { 'identifier_values': identifierValue } },
|
|
801
|
+
// Back-compat with legacy mapping where these were dynamically text + .keyword.
|
|
779
802
|
{ term: { 'cas_numbers.keyword': identifierValue } },
|
|
780
803
|
{ term: { 'identifier_values.keyword': identifierValue } }
|
|
781
804
|
],
|
|
@@ -924,6 +947,7 @@ class ChemicalsService {
|
|
|
924
947
|
}
|
|
925
948
|
|
|
926
949
|
try {
|
|
950
|
+
const isIdentifier = looksLikeIdentifier(searchTerm);
|
|
927
951
|
const variations = getCasNumberVariations(searchTerm);
|
|
928
952
|
const shouldClauses = [];
|
|
929
953
|
for (const v of variations) {
|
|
@@ -935,12 +959,22 @@ class ChemicalsService {
|
|
|
935
959
|
{ term: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
936
960
|
{ prefix: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
937
961
|
{ term: { 'identifier_values': { value: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
938
|
-
{ prefix: { 'identifier_values': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
939
|
-
{ match: { 'chemical_name': { query: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
940
|
-
{ match: { 'synonyms': { query: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
941
|
-
{ match_phrase_prefix: { 'chemical_name': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
942
|
-
{ match_phrase_prefix: { 'synonyms': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
962
|
+
{ prefix: { 'identifier_values': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
943
963
|
);
|
|
964
|
+
// Skip tokenized-text matches for identifier-shaped queries to avoid
|
|
965
|
+
// noise from the analyzer splitting "71-43-2" into 71/43/2 tokens.
|
|
966
|
+
if (!isIdentifier) {
|
|
967
|
+
shouldClauses.push(
|
|
968
|
+
// Whitespace-analyzer fields (preserve hyphenated compound names).
|
|
969
|
+
{ match: { 'chemical_name': { query: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
970
|
+
{ match: { 'synonyms': { query: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
971
|
+
{ match_phrase_prefix: { 'chemical_name': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
972
|
+
{ match_phrase_prefix: { 'synonyms': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } },
|
|
973
|
+
// Word-level (standard analyzer) sub-fields — let "ethane" still match "trichloroethane".
|
|
974
|
+
{ match: { 'chemical_name.std': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
975
|
+
{ match: { 'synonyms.std': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
976
|
+
);
|
|
977
|
+
}
|
|
944
978
|
}
|
|
945
979
|
|
|
946
980
|
const result = await this.connection.invokeOpenSearch({
|
|
@@ -989,6 +1023,7 @@ class ChemicalsService {
|
|
|
989
1023
|
}
|
|
990
1024
|
|
|
991
1025
|
try {
|
|
1026
|
+
const isIdentifier = looksLikeIdentifier(synonymTerm);
|
|
992
1027
|
const variations = getCasNumberVariations(synonymTerm);
|
|
993
1028
|
const shouldClauses = [];
|
|
994
1029
|
for (const v of variations) {
|
|
@@ -1000,12 +1035,22 @@ class ChemicalsService {
|
|
|
1000
1035
|
{ term: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
1001
1036
|
{ prefix: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
1002
1037
|
{ term: { 'identifier_values': { value: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
1003
|
-
{ prefix: { 'identifier_values': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
1004
|
-
{ match: { 'synonyms': { query: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
1005
|
-
{ match: { 'chemical_name': { query: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
1006
|
-
{ match_phrase_prefix: { 'synonyms': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
1007
|
-
{ match_phrase_prefix: { 'chemical_name': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
1038
|
+
{ prefix: { 'identifier_values': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
1008
1039
|
);
|
|
1040
|
+
// Skip tokenized-text matches for identifier-shaped queries to avoid
|
|
1041
|
+
// noise from the analyzer splitting "71-43-2" into 71/43/2 tokens.
|
|
1042
|
+
if (!isIdentifier) {
|
|
1043
|
+
shouldClauses.push(
|
|
1044
|
+
// Whitespace-analyzer fields (preserve hyphenated compound names).
|
|
1045
|
+
{ match: { 'synonyms': { query: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
1046
|
+
{ match: { 'chemical_name': { query: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
1047
|
+
{ match_phrase_prefix: { 'synonyms': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
1048
|
+
{ match_phrase_prefix: { 'chemical_name': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } },
|
|
1049
|
+
// Word-level (standard analyzer) sub-fields — let "ethane" still match "trichloroethane".
|
|
1050
|
+
{ match: { 'synonyms.std': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
1051
|
+
{ match: { 'chemical_name.std': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
1052
|
+
);
|
|
1053
|
+
}
|
|
1009
1054
|
}
|
|
1010
1055
|
|
|
1011
1056
|
const result = await this.connection.invokeOpenSearch({
|
|
@@ -1305,7 +1350,7 @@ class ChemicalsService {
|
|
|
1305
1350
|
search: async (params) => {
|
|
1306
1351
|
let searchTerm = '';
|
|
1307
1352
|
let limit = params.body?.size || 10;
|
|
1308
|
-
|
|
1353
|
+
|
|
1309
1354
|
const toLegacySource = (r) => this._toLegacyChemicalSource({
|
|
1310
1355
|
name: r.name,
|
|
1311
1356
|
cas: r.cas || [],
|
|
@@ -1313,11 +1358,21 @@ class ChemicalsService {
|
|
|
1313
1358
|
id: r.id
|
|
1314
1359
|
});
|
|
1315
1360
|
|
|
1361
|
+
// Callers may pass an Elasticsearch query_string with reserved chars escaped
// (e.g. "71\-43\-2"), optionally ending in a wildcard. Undo the escaping and
// drop one trailing *unescaped* wildcard so keyword term/prefix clauses against
// cas_numbers / identifier_values can match.
//
// Escapes are consumed pairwise in a single left-to-right pass, so an escaped
// trailing asterisk ("abc\*") is kept as a literal "*" rather than being
// mistaken for a wildcard and leaving a dangling backslash behind.
//
// @param {*} raw - query_string text; any falsy value yields ''.
// @returns {string} the unescaped term without its trailing wildcard.
const normalizeQueryString = (raw) => {
  if (!raw) return '';
  return String(raw).replace(
    // First alternative: a backslash plus the character it escapes (keep the
    // character). Second alternative: a bare "*" at the very end (drop it).
    /\\(.)|\*$/g,
    (match, escaped) => (escaped === undefined ? '' : escaped),
  );
};
|
|
1370
|
+
|
|
1316
1371
|
if (params.index === 'synonym_lookup_index') {
|
|
1317
1372
|
const query = params.body?.query;
|
|
1318
1373
|
searchTerm = query?.match?.chemical_name ||
|
|
1319
1374
|
query?.term?.chemical_name ||
|
|
1320
|
-
query?.query_string?.query || '';
|
|
1375
|
+
normalizeQueryString(query?.query_string?.query) || '';
|
|
1321
1376
|
const searchResults = await this.searchBySynonym(searchTerm, limit);
|
|
1322
1377
|
|
|
1323
1378
|
return {
|
|
@@ -1347,7 +1402,7 @@ class ChemicalsService {
|
|
|
1347
1402
|
const query = params.body?.query;
|
|
1348
1403
|
searchTerm = query?.match?.chemical_name ||
|
|
1349
1404
|
query?.term?.chemical_name ||
|
|
1350
|
-
query?.query_string?.query || '';
|
|
1405
|
+
normalizeQueryString(query?.query_string?.query) || '';
|
|
1351
1406
|
const searchResults = await this.searchByName(searchTerm, limit);
|
|
1352
1407
|
|
|
1353
1408
|
return {
|
package/package.json
CHANGED