@toxplanet/pegasus-sdk 1.2.8 → 1.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/chemicals.js +140 -33
- package/lib/db/index.js +10 -0
- package/lib/search.js +1 -0
- package/package.json +1 -1
package/lib/chemicals.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
const { logError, logInfo } = require('@toxplanet/tphelper/logging');
|
|
2
2
|
const { SQSClient, SendMessageCommand } = require('@aws-sdk/client-sqs');
|
|
3
|
+
const { getCasNumberVariations } = require('./search');
|
|
3
4
|
|
|
4
5
|
const SEARCH_BOOST_EXACT_PRIMARY = 100;
|
|
5
6
|
const SEARCH_BOOST_PREFIX_PRIMARY = 50;
|
|
@@ -8,11 +9,22 @@ const SEARCH_BOOST_PREFIX_SECONDARY = 10;
|
|
|
8
9
|
|
|
9
10
|
const ALLOWED_IDENTIFIER_TYPES = new Set(['CAS', 'SMILES', 'InChI', 'InChIKey', 'PubChem', 'CID', 'DTXSID', 'EINECS', 'EC']);
|
|
10
11
|
|
|
12
|
+
// Alias groups used to normalize CDI identifier_key spellings. Every entry
// below maps the interchangeable keys of one identifier to the display name
// that is used when that key has to be synthesized from a sibling: if one
// member of a group exists on a row, the missing members are backfilled with
// the same values so consumers can look up by whichever spelling they prefer.
// `keys` is derived from `names`, so the two can never drift apart; the order
// of `keys` is the declaration order of the properties.
const IDENTIFIER_ALIAS_GROUPS = [
  { cid: 'CID', pubchem_cid: 'PubChem CID', pubchem: 'PubChem' },
  { inchikey: 'InChIKey', inchi_key: 'InChIKey' },
  { ec: 'EC', ec_number: 'EC Number' },
  { cas: 'CAS', cas_number: 'CAS Number', cas_rn: 'CAS RN' }
].map((names) => ({ keys: Object.keys(names), names }));
|
|
21
|
+
|
|
11
22
|
/**
 * Escapes the characters that are special inside a SQL LIKE pattern
 * (`%`, `_` and the backslash itself) by prefixing each with a backslash.
 *
 * Numbers and bigints are converted to their string form first, so a numeric
 * search term no longer fails with "value.replace is not a function".
 *
 * @param {string|number|bigint} value - Raw text to embed in a LIKE pattern.
 * @returns {string} The text with `%`, `_` and `\` backslash-escaped.
 * @throws {TypeError} If `value` is not a string, number or bigint.
 */
function escapeLikePattern(value) {
  const text = (typeof value === 'number' || typeof value === 'bigint') ? String(value) : value;
  if (typeof text !== 'string') {
    const received = value === null ? 'null' : typeof value;
    throw new TypeError(`escapeLikePattern expected a string, received ${received}`);
  }
  // '$&' re-inserts the matched character after the escaping backslash.
  return text.replace(/[%_\\]/g, '\\$&');
}
|
|
14
25
|
|
|
15
26
|
function parsePostgresArray(str) {
|
|
27
|
+
if (Array.isArray(str)) return str.map((v) => String(v));
|
|
16
28
|
if (!str || str === '{}') return [];
|
|
17
29
|
const trimmed = str.slice(1, -1);
|
|
18
30
|
if (!trimmed) return [];
|
|
@@ -159,6 +171,85 @@ class ChemicalsService {
|
|
|
159
171
|
});
|
|
160
172
|
}
|
|
161
173
|
|
|
174
|
+
_expandIdentifierAliases(identifiers) {
|
|
175
|
+
const byKey = new Map(identifiers.map((i) => [i.identifier_key, i]));
|
|
176
|
+
const additions = [];
|
|
177
|
+
for (const group of IDENTIFIER_ALIAS_GROUPS) {
|
|
178
|
+
const present = group.keys.find((k) => byKey.has(k));
|
|
179
|
+
if (!present) continue;
|
|
180
|
+
const source = byKey.get(present);
|
|
181
|
+
for (const aliasKey of group.keys) {
|
|
182
|
+
if (byKey.has(aliasKey)) continue;
|
|
183
|
+
const alias = {
|
|
184
|
+
identifier_key: aliasKey,
|
|
185
|
+
identifier_name: group.names[aliasKey] || aliasKey,
|
|
186
|
+
identifier_value: [...source.identifier_value]
|
|
187
|
+
};
|
|
188
|
+
additions.push(alias);
|
|
189
|
+
byKey.set(aliasKey, alias);
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
return identifiers.concat(additions);
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
_chemicalRowToCDISource(chemical) {
|
|
196
|
+
if (!chemical) return null;
|
|
197
|
+
|
|
198
|
+
const identifierGroups = new Map();
|
|
199
|
+
const identifierList = Array.isArray(chemical.chemicalIdentifiers) ? chemical.chemicalIdentifiers : [];
|
|
200
|
+
for (const item of identifierList) {
|
|
201
|
+
if (!item || typeof item !== 'object') continue;
|
|
202
|
+
const type = item.type || item.identifier_key || '';
|
|
203
|
+
if (!type) continue;
|
|
204
|
+
const rawValue = item.value !== undefined ? item.value : item.identifier_value;
|
|
205
|
+
const values = Array.isArray(rawValue) ? rawValue : (rawValue != null ? [rawValue] : []);
|
|
206
|
+
if (!identifierGroups.has(type)) {
|
|
207
|
+
identifierGroups.set(type, {
|
|
208
|
+
identifier_key: String(type).toLowerCase(),
|
|
209
|
+
identifier_name: String(type),
|
|
210
|
+
identifier_value: []
|
|
211
|
+
});
|
|
212
|
+
}
|
|
213
|
+
const group = identifierGroups.get(type);
|
|
214
|
+
for (const v of values) {
|
|
215
|
+
if (v == null || v === '') continue;
|
|
216
|
+
group.identifier_value.push(String(v));
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
const identifiers = Array.from(identifierGroups.values());
|
|
220
|
+
|
|
221
|
+
const metaList = Array.isArray(chemical.chemicalMeta) ? chemical.chemicalMeta : [];
|
|
222
|
+
const meta = metaList.map((item) => {
|
|
223
|
+
const key = (item && (item.key || item.meta_key)) || '';
|
|
224
|
+
const rawValue = item && (item.value !== undefined ? item.value : item.meta_value_text);
|
|
225
|
+
const valueArr = Array.isArray(rawValue)
|
|
226
|
+
? rawValue.map((v) => String(v))
|
|
227
|
+
: (rawValue != null ? [String(rawValue)] : []);
|
|
228
|
+
const unit = item && (item.unit || item.meta_value_unit);
|
|
229
|
+
if (unit && valueArr.length > 0) {
|
|
230
|
+
valueArr[valueArr.length - 1] = `${valueArr[valueArr.length - 1]} ${unit}`;
|
|
231
|
+
}
|
|
232
|
+
return {
|
|
233
|
+
meta_key: String(key).toLowerCase(),
|
|
234
|
+
meta_value_text: valueArr
|
|
235
|
+
};
|
|
236
|
+
});
|
|
237
|
+
|
|
238
|
+
const synonyms = Array.isArray(chemical.chemicalSynonyms) ? chemical.chemicalSynonyms : [];
|
|
239
|
+
|
|
240
|
+
return {
|
|
241
|
+
chemical_set_identifier: chemical.sourceId || '',
|
|
242
|
+
chemical_primary_name: chemical.chemicalName || '',
|
|
243
|
+
chemical_names: synonyms,
|
|
244
|
+
chemical_synonyms: synonyms,
|
|
245
|
+
chemical_categories: chemical.chemicalCategories || [],
|
|
246
|
+
chemical_identifiers: identifiers,
|
|
247
|
+
chemical_meta: meta,
|
|
248
|
+
chemical_created_at: chemical.createdAt,
|
|
249
|
+
chemical_updated_at: chemical.updatedAt
|
|
250
|
+
};
|
|
251
|
+
}
|
|
252
|
+
|
|
162
253
|
_mapChemicalRow(row) {
|
|
163
254
|
if (!row) return null;
|
|
164
255
|
return {
|
|
@@ -833,22 +924,32 @@ class ChemicalsService {
|
|
|
833
924
|
}
|
|
834
925
|
|
|
835
926
|
try {
|
|
927
|
+
const variations = getCasNumberVariations(searchTerm);
|
|
928
|
+
const shouldClauses = [];
|
|
929
|
+
for (const v of variations) {
|
|
930
|
+
shouldClauses.push(
|
|
931
|
+
{ term: { 'chemical_name.keyword': { value: v, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
|
|
932
|
+
{ prefix: { 'chemical_name.keyword': { value: v, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
|
|
933
|
+
{ term: { 'synonyms.keyword': { value: v, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
|
|
934
|
+
{ prefix: { 'synonyms.keyword': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } },
|
|
935
|
+
{ term: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
936
|
+
{ prefix: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
937
|
+
{ term: { 'identifier_values': { value: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
938
|
+
{ prefix: { 'identifier_values': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } },
|
|
939
|
+
{ match: { 'chemical_name': { query: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
940
|
+
{ match: { 'synonyms': { query: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
941
|
+
{ match_phrase_prefix: { 'chemical_name': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
942
|
+
{ match_phrase_prefix: { 'synonyms': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
943
|
+
);
|
|
944
|
+
}
|
|
945
|
+
|
|
836
946
|
const result = await this.connection.invokeOpenSearch({
|
|
837
947
|
operation: 'search',
|
|
838
948
|
body: {
|
|
839
949
|
size: limit,
|
|
840
950
|
query: {
|
|
841
951
|
bool: {
|
|
842
|
-
should:
|
|
843
|
-
{ term: { 'chemical_name.keyword': { value: searchTerm, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
|
|
844
|
-
{ prefix: { 'chemical_name.keyword': { value: searchTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
|
|
845
|
-
{ term: { 'synonyms.keyword': { value: searchTerm, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
|
|
846
|
-
{ prefix: { 'synonyms.keyword': { value: searchTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } },
|
|
847
|
-
{ match: { 'chemical_name': { query: searchTerm, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
848
|
-
{ match: { 'synonyms': { query: searchTerm, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
849
|
-
{ match_phrase_prefix: { 'chemical_name': { query: searchTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
850
|
-
{ match_phrase_prefix: { 'synonyms': { query: searchTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
851
|
-
],
|
|
952
|
+
should: shouldClauses,
|
|
852
953
|
minimum_should_match: 1
|
|
853
954
|
}
|
|
854
955
|
},
|
|
@@ -888,22 +989,32 @@ class ChemicalsService {
|
|
|
888
989
|
}
|
|
889
990
|
|
|
890
991
|
try {
|
|
992
|
+
const variations = getCasNumberVariations(synonymTerm);
|
|
993
|
+
const shouldClauses = [];
|
|
994
|
+
for (const v of variations) {
|
|
995
|
+
shouldClauses.push(
|
|
996
|
+
{ term: { 'synonyms.keyword': { value: v, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
|
|
997
|
+
{ prefix: { 'synonyms.keyword': { value: v, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
|
|
998
|
+
{ term: { 'chemical_name.keyword': { value: v, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
|
|
999
|
+
{ prefix: { 'chemical_name.keyword': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } },
|
|
1000
|
+
{ term: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
1001
|
+
{ prefix: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
1002
|
+
{ term: { 'identifier_values': { value: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
1003
|
+
{ prefix: { 'identifier_values': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } },
|
|
1004
|
+
{ match: { 'synonyms': { query: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
1005
|
+
{ match: { 'chemical_name': { query: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
1006
|
+
{ match_phrase_prefix: { 'synonyms': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
1007
|
+
{ match_phrase_prefix: { 'chemical_name': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
1008
|
+
);
|
|
1009
|
+
}
|
|
1010
|
+
|
|
891
1011
|
const result = await this.connection.invokeOpenSearch({
|
|
892
1012
|
operation: 'search',
|
|
893
1013
|
body: {
|
|
894
1014
|
size: limit,
|
|
895
1015
|
query: {
|
|
896
1016
|
bool: {
|
|
897
|
-
should:
|
|
898
|
-
{ term: { 'synonyms.keyword': { value: synonymTerm, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
|
|
899
|
-
{ prefix: { 'synonyms.keyword': { value: synonymTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
|
|
900
|
-
{ term: { 'chemical_name.keyword': { value: synonymTerm, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
|
|
901
|
-
{ prefix: { 'chemical_name.keyword': { value: synonymTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } },
|
|
902
|
-
{ match: { 'synonyms': { query: synonymTerm, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
903
|
-
{ match: { 'chemical_name': { query: synonymTerm, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
904
|
-
{ match_phrase_prefix: { 'synonyms': { query: synonymTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
905
|
-
{ match_phrase_prefix: { 'chemical_name': { query: synonymTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
906
|
-
],
|
|
1017
|
+
should: shouldClauses,
|
|
907
1018
|
minimum_should_match: 1
|
|
908
1019
|
}
|
|
909
1020
|
},
|
|
@@ -1102,24 +1213,16 @@ class ChemicalsService {
|
|
|
1102
1213
|
|
|
1103
1214
|
get: async (params) => {
|
|
1104
1215
|
const id = params.id;
|
|
1105
|
-
let
|
|
1216
|
+
let chemical = await this.getChemicalBySourceId(id);
|
|
1106
1217
|
|
|
1107
|
-
|
|
1108
|
-
if (chemical) {
|
|
1109
|
-
source = this._chemicalRowToLegacySource(chemical);
|
|
1110
|
-
} else {
|
|
1218
|
+
if (!chemical) {
|
|
1111
1219
|
const hit = await this.findChemicalByIdentifier(id);
|
|
1112
|
-
if (hit) {
|
|
1113
|
-
|
|
1114
|
-
name: hit.name,
|
|
1115
|
-
cas: hit.cas,
|
|
1116
|
-
identifiers: hit.identifiers,
|
|
1117
|
-
id: hit.id
|
|
1118
|
-
});
|
|
1220
|
+
if (hit && hit.id) {
|
|
1221
|
+
chemical = await this.getChemicalById(hit.id);
|
|
1119
1222
|
}
|
|
1120
1223
|
}
|
|
1121
1224
|
|
|
1122
|
-
if (!
|
|
1225
|
+
if (!chemical) {
|
|
1123
1226
|
return {
|
|
1124
1227
|
body: {
|
|
1125
1228
|
_index: params.index,
|
|
@@ -1130,6 +1233,10 @@ class ChemicalsService {
|
|
|
1130
1233
|
};
|
|
1131
1234
|
}
|
|
1132
1235
|
|
|
1236
|
+
const source = (params.index === 'chemical_data_index')
|
|
1237
|
+
? this._chemicalRowToCDISource(chemical)
|
|
1238
|
+
: this._chemicalRowToLegacySource(chemical);
|
|
1239
|
+
|
|
1133
1240
|
return {
|
|
1134
1241
|
body: {
|
|
1135
1242
|
_index: params.index,
|
package/lib/db/index.js
CHANGED
|
@@ -4,6 +4,16 @@ function getFieldValue(field) {
|
|
|
4
4
|
if ('longValue' in field) return field.longValue;
|
|
5
5
|
if ('doubleValue' in field) return field.doubleValue;
|
|
6
6
|
if ('booleanValue' in field) return field.booleanValue;
|
|
7
|
+
if ('blobValue' in field) return field.blobValue;
|
|
8
|
+
if ('arrayValue' in field && field.arrayValue) {
|
|
9
|
+
const av = field.arrayValue;
|
|
10
|
+
if (Array.isArray(av.stringValues)) return av.stringValues;
|
|
11
|
+
if (Array.isArray(av.longValues)) return av.longValues;
|
|
12
|
+
if (Array.isArray(av.doubleValues)) return av.doubleValues;
|
|
13
|
+
if (Array.isArray(av.booleanValues)) return av.booleanValues;
|
|
14
|
+
if (Array.isArray(av.arrayValues)) return av.arrayValues;
|
|
15
|
+
return [];
|
|
16
|
+
}
|
|
7
17
|
return null;
|
|
8
18
|
}
|
|
9
19
|
|
package/lib/search.js
CHANGED
package/package.json
CHANGED