@toxplanet/pegasus-sdk 1.2.12 → 1.2.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/chemicals.js +55 -10
- package/package.json +1 -1
package/lib/chemicals.js
CHANGED
|
@@ -9,6 +9,25 @@ const SEARCH_BOOST_PREFIX_SECONDARY = 10;
|
|
|
9
9
|
|
|
10
10
|
const ALLOWED_IDENTIFIER_TYPES = new Set(['CAS', 'SMILES', 'InChI', 'InChIKey', 'PubChem', 'CID', 'DTXSID', 'EINECS', 'EC']);
|
|
11
11
|
|
|
12
|
+
// Heuristic: does the search term look like a chemical identifier (CAS, EC,
|
|
13
|
+
// InChIKey, CID, etc.) rather than a chemical name? If so, restrict to keyword
|
|
14
|
+
// matches so we don't get noise from tokenized text-field matches on names.
|
|
15
|
+
/**
 * Heuristically decide whether a search term is shaped like a chemical
 * identifier rather than a chemical name.
 *
 * Callers use the result to skip tokenized text-field clauses
 * (chemical_name / synonyms) for identifier-shaped queries.
 *
 * Recognised shapes:
 *   - CAS / EC style: three digit groups joined by "-" or "/" (e.g. 71-43-2,
 *     200-753-7). The pattern is deliberately loose; it checks shape only and
 *     does not validate the CAS check digit.
 *   - InChIKey: 14-10-1 uppercase letters joined by hyphens.
 *   - InChI: any string starting with "InChI=" (case-insensitive).
 *   - PubChem CID: "CID" followed by digits (case-insensitive).
 *   - DTXSID: "DTXSID" followed by digits (case-insensitive).
 *
 * @param {*} term - Raw search term; non-strings are coerced with String().
 * @returns {boolean} true when the trimmed term matches one of the shapes
 *   above; false for empty, whitespace-only, or falsy input.
 */
function looksLikeIdentifier(term) {
  if (!term) return false;
  const candidate = String(term).trim();
  if (!candidate) return false;

  // CAS / EC: digit groups with hyphen or slash separators. EC numbers
  // (xxx-xxx-x) are a strict subset of this shape, so one pattern covers both.
  if (/^\d{1,7}[-/]\d{1,3}[-/]\d{1,2}$/.test(candidate)) return true;

  // InChIKey: 14-10-1 uppercase letters with hyphens.
  if (/^[A-Z]{14}-[A-Z]{10}-[A-Z]$/.test(candidate)) return true;

  // InChI: listed in ALLOWED_IDENTIFIER_TYPES; the "InChI=" prefix is
  // unambiguous, so the rest of the string is not inspected.
  if (/^InChI=/i.test(candidate)) return true;

  // PubChem CID.
  if (/^CID\d+$/i.test(candidate)) return true;

  // DTXSID.
  if (/^DTXSID\d+$/i.test(candidate)) return true;

  return false;
}
|
|
30
|
+
|
|
12
31
|
// Equivalence groups for CDI identifier_key normalization. When any member of
|
|
13
32
|
// a group is present on a row, the others are backfilled with the same values
|
|
14
33
|
// so consumers can look up by their preferred spelling.
|
|
@@ -776,6 +795,10 @@ class ChemicalsService {
|
|
|
776
795
|
query: {
|
|
777
796
|
bool: {
|
|
778
797
|
should: [
|
|
798
|
+
// v2 mapping: cas_numbers / identifier_values are keyword type.
|
|
799
|
+
{ term: { 'cas_numbers': identifierValue } },
|
|
800
|
+
{ term: { 'identifier_values': identifierValue } },
|
|
801
|
+
// Back-compat with legacy mapping where these were dynamically text + .keyword.
|
|
779
802
|
{ term: { 'cas_numbers.keyword': identifierValue } },
|
|
780
803
|
{ term: { 'identifier_values.keyword': identifierValue } }
|
|
781
804
|
],
|
|
@@ -924,6 +947,7 @@ class ChemicalsService {
|
|
|
924
947
|
}
|
|
925
948
|
|
|
926
949
|
try {
|
|
950
|
+
const isIdentifier = looksLikeIdentifier(searchTerm);
|
|
927
951
|
const variations = getCasNumberVariations(searchTerm);
|
|
928
952
|
const shouldClauses = [];
|
|
929
953
|
for (const v of variations) {
|
|
@@ -935,12 +959,22 @@ class ChemicalsService {
|
|
|
935
959
|
{ term: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
936
960
|
{ prefix: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
937
961
|
{ term: { 'identifier_values': { value: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
938
|
-
{ prefix: { 'identifier_values': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
939
|
-
{ match: { 'chemical_name': { query: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
940
|
-
{ match: { 'synonyms': { query: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
941
|
-
{ match_phrase_prefix: { 'chemical_name': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
942
|
-
{ match_phrase_prefix: { 'synonyms': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
962
|
+
{ prefix: { 'identifier_values': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
943
963
|
);
|
|
964
|
+
// Skip tokenized-text matches for identifier-shaped queries to avoid
|
|
965
|
+
// noise from the analyzer splitting "71-43-2" into 71/43/2 tokens.
|
|
966
|
+
if (!isIdentifier) {
|
|
967
|
+
shouldClauses.push(
|
|
968
|
+
// Whitespace-analyzer fields (preserve hyphenated compound names).
|
|
969
|
+
{ match: { 'chemical_name': { query: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
970
|
+
{ match: { 'synonyms': { query: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
971
|
+
{ match_phrase_prefix: { 'chemical_name': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
972
|
+
{ match_phrase_prefix: { 'synonyms': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } },
|
|
973
|
+
// Word-level (standard analyzer) sub-fields — let "ethane" still match "trichloroethane".
|
|
974
|
+
{ match: { 'chemical_name.std': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
975
|
+
{ match: { 'synonyms.std': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
976
|
+
);
|
|
977
|
+
}
|
|
944
978
|
}
|
|
945
979
|
|
|
946
980
|
const result = await this.connection.invokeOpenSearch({
|
|
@@ -989,6 +1023,7 @@ class ChemicalsService {
|
|
|
989
1023
|
}
|
|
990
1024
|
|
|
991
1025
|
try {
|
|
1026
|
+
const isIdentifier = looksLikeIdentifier(synonymTerm);
|
|
992
1027
|
const variations = getCasNumberVariations(synonymTerm);
|
|
993
1028
|
const shouldClauses = [];
|
|
994
1029
|
for (const v of variations) {
|
|
@@ -1000,12 +1035,22 @@ class ChemicalsService {
|
|
|
1000
1035
|
{ term: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
1001
1036
|
{ prefix: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
1002
1037
|
{ term: { 'identifier_values': { value: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
1003
|
-
{ prefix: { 'identifier_values': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
1004
|
-
{ match: { 'synonyms': { query: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
1005
|
-
{ match: { 'chemical_name': { query: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
1006
|
-
{ match_phrase_prefix: { 'synonyms': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
1007
|
-
{ match_phrase_prefix: { 'chemical_name': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
1038
|
+
{ prefix: { 'identifier_values': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
1008
1039
|
);
|
|
1040
|
+
// Skip tokenized-text matches for identifier-shaped queries to avoid
|
|
1041
|
+
// noise from the analyzer splitting "71-43-2" into 71/43/2 tokens.
|
|
1042
|
+
if (!isIdentifier) {
|
|
1043
|
+
shouldClauses.push(
|
|
1044
|
+
// Whitespace-analyzer fields (preserve hyphenated compound names).
|
|
1045
|
+
{ match: { 'synonyms': { query: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
1046
|
+
{ match: { 'chemical_name': { query: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
1047
|
+
{ match_phrase_prefix: { 'synonyms': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
1048
|
+
{ match_phrase_prefix: { 'chemical_name': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } },
|
|
1049
|
+
// Word-level (standard analyzer) sub-fields — let "ethane" still match "trichloroethane".
|
|
1050
|
+
{ match: { 'synonyms.std': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
1051
|
+
{ match: { 'chemical_name.std': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
1052
|
+
);
|
|
1053
|
+
}
|
|
1009
1054
|
}
|
|
1010
1055
|
|
|
1011
1056
|
const result = await this.connection.invokeOpenSearch({
|
package/package.json
CHANGED