@toxplanet/pegasus-sdk 1.2.8 → 1.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/chemicals.js CHANGED
@@ -1,5 +1,6 @@
1
1
  const { logError, logInfo } = require('@toxplanet/tphelper/logging');
2
2
  const { SQSClient, SendMessageCommand } = require('@aws-sdk/client-sqs');
3
+ const { getCasNumberVariations } = require('./search');
3
4
 
4
5
  const SEARCH_BOOST_EXACT_PRIMARY = 100;
5
6
  const SEARCH_BOOST_PREFIX_PRIMARY = 50;
@@ -8,11 +9,22 @@ const SEARCH_BOOST_PREFIX_SECONDARY = 10;
8
9
 
9
10
  const ALLOWED_IDENTIFIER_TYPES = new Set(['CAS', 'SMILES', 'InChI', 'InChIKey', 'PubChem', 'CID', 'DTXSID', 'EINECS', 'EC']);
10
11
 
12
+ // Equivalence groups of CDI identifier_key spellings. Each group lists its
13
+ // interchangeable `keys` and, in `names`, the identifier_name to use for each key.
14
+ // _expandIdentifierAliases backfills absent keys of a group from one that is present.
15
+ const IDENTIFIER_ALIAS_GROUPS = [
16
+ { keys: ['cid', 'pubchem_cid', 'pubchem'], names: { cid: 'CID', pubchem_cid: 'PubChem CID', pubchem: 'PubChem' } },
17
+ { keys: ['inchikey', 'inchi_key'], names: { inchikey: 'InChIKey', inchi_key: 'InChIKey' } },
18
+ { keys: ['ec', 'ec_number'], names: { ec: 'EC', ec_number: 'EC Number' } },
19
+ { keys: ['cas', 'cas_number', 'cas_rn'], names: { cas: 'CAS', cas_number: 'CAS Number', cas_rn: 'CAS RN' } }
20
+ ];
21
+
11
22
  function escapeLikePattern(value) {
12
23
  return value.replace(/[%_\\]/g, '\\$&');
13
24
  }
14
25
 
15
26
  function parsePostgresArray(str) {
27
+ if (Array.isArray(str)) return str.map((v) => String(v));
16
28
  if (!str || str === '{}') return [];
17
29
  const trimmed = str.slice(1, -1);
18
30
  if (!trimmed) return [];
@@ -159,6 +171,85 @@ class ChemicalsService {
159
171
  });
160
172
  }
161
173
 
174
+ _expandIdentifierAliases(identifiers) {
175
+ const byKey = new Map(identifiers.map((i) => [i.identifier_key, i]));
176
+ const additions = [];
177
+ for (const group of IDENTIFIER_ALIAS_GROUPS) {
178
+ const present = group.keys.find((k) => byKey.has(k));
179
+ if (!present) continue;
180
+ const source = byKey.get(present);
181
+ for (const aliasKey of group.keys) {
182
+ if (byKey.has(aliasKey)) continue;
183
+ const alias = {
184
+ identifier_key: aliasKey,
185
+ identifier_name: group.names[aliasKey] || aliasKey,
186
+ identifier_value: [...source.identifier_value]
187
+ };
188
+ additions.push(alias);
189
+ byKey.set(aliasKey, alias);
190
+ }
191
+ }
192
+ return identifiers.concat(additions);
193
+ }
194
+
195
+ _chemicalRowToCDISource(chemical) {
196
+ if (!chemical) return null;
197
+
198
+ const identifierGroups = new Map();
199
+ const identifierList = Array.isArray(chemical.chemicalIdentifiers) ? chemical.chemicalIdentifiers : [];
200
+ for (const item of identifierList) {
201
+ if (!item || typeof item !== 'object') continue;
202
+ const type = item.type || item.identifier_key || '';
203
+ if (!type) continue;
204
+ const rawValue = item.value !== undefined ? item.value : item.identifier_value;
205
+ const values = Array.isArray(rawValue) ? rawValue : (rawValue != null ? [rawValue] : []);
206
+ if (!identifierGroups.has(type)) {
207
+ identifierGroups.set(type, {
208
+ identifier_key: String(type).toLowerCase(),
209
+ identifier_name: String(type),
210
+ identifier_value: []
211
+ });
212
+ }
213
+ const group = identifierGroups.get(type);
214
+ for (const v of values) {
215
+ if (v == null || v === '') continue;
216
+ group.identifier_value.push(String(v));
217
+ }
218
+ }
219
+ const identifiers = Array.from(identifierGroups.values());
220
+
221
+ const metaList = Array.isArray(chemical.chemicalMeta) ? chemical.chemicalMeta : [];
222
+ const meta = metaList.map((item) => {
223
+ const key = (item && (item.key || item.meta_key)) || '';
224
+ const rawValue = item && (item.value !== undefined ? item.value : item.meta_value_text);
225
+ const valueArr = Array.isArray(rawValue)
226
+ ? rawValue.map((v) => String(v))
227
+ : (rawValue != null ? [String(rawValue)] : []);
228
+ const unit = item && (item.unit || item.meta_value_unit);
229
+ if (unit && valueArr.length > 0) {
230
+ valueArr[valueArr.length - 1] = `${valueArr[valueArr.length - 1]} ${unit}`;
231
+ }
232
+ return {
233
+ meta_key: String(key).toLowerCase(),
234
+ meta_value_text: valueArr
235
+ };
236
+ });
237
+
238
+ const synonyms = Array.isArray(chemical.chemicalSynonyms) ? chemical.chemicalSynonyms : [];
239
+
240
+ return {
241
+ chemical_set_identifier: chemical.sourceId || '',
242
+ chemical_primary_name: chemical.chemicalName || '',
243
+ chemical_names: synonyms,
244
+ chemical_synonyms: synonyms,
245
+ chemical_categories: chemical.chemicalCategories || [],
246
+ chemical_identifiers: identifiers,
247
+ chemical_meta: meta,
248
+ chemical_created_at: chemical.createdAt,
249
+ chemical_updated_at: chemical.updatedAt
250
+ };
251
+ }
252
+
162
253
  _mapChemicalRow(row) {
163
254
  if (!row) return null;
164
255
  return {
@@ -833,22 +924,32 @@ class ChemicalsService {
833
924
  }
834
925
 
835
926
  try {
927
+ const variations = getCasNumberVariations(searchTerm);
928
+ const shouldClauses = [];
929
+ for (const v of variations) {
930
+ shouldClauses.push(
931
+ { term: { 'chemical_name.keyword': { value: v, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
932
+ { prefix: { 'chemical_name.keyword': { value: v, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
933
+ { term: { 'synonyms.keyword': { value: v, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
934
+ { prefix: { 'synonyms.keyword': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } },
935
+ { term: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
936
+ { prefix: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
937
+ { term: { 'identifier_values': { value: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
938
+ { prefix: { 'identifier_values': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } },
939
+ { match: { 'chemical_name': { query: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
940
+ { match: { 'synonyms': { query: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
941
+ { match_phrase_prefix: { 'chemical_name': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
942
+ { match_phrase_prefix: { 'synonyms': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
943
+ );
944
+ }
945
+
836
946
  const result = await this.connection.invokeOpenSearch({
837
947
  operation: 'search',
838
948
  body: {
839
949
  size: limit,
840
950
  query: {
841
951
  bool: {
842
- should: [
843
- { term: { 'chemical_name.keyword': { value: searchTerm, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
844
- { prefix: { 'chemical_name.keyword': { value: searchTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
845
- { term: { 'synonyms.keyword': { value: searchTerm, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
846
- { prefix: { 'synonyms.keyword': { value: searchTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } },
847
- { match: { 'chemical_name': { query: searchTerm, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
848
- { match: { 'synonyms': { query: searchTerm, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
849
- { match_phrase_prefix: { 'chemical_name': { query: searchTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
850
- { match_phrase_prefix: { 'synonyms': { query: searchTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
851
- ],
952
+ should: shouldClauses,
852
953
  minimum_should_match: 1
853
954
  }
854
955
  },
@@ -888,22 +989,32 @@ class ChemicalsService {
888
989
  }
889
990
 
890
991
  try {
992
+ const variations = getCasNumberVariations(synonymTerm);
993
+ const shouldClauses = [];
994
+ for (const v of variations) {
995
+ shouldClauses.push(
996
+ { term: { 'synonyms.keyword': { value: v, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
997
+ { prefix: { 'synonyms.keyword': { value: v, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
998
+ { term: { 'chemical_name.keyword': { value: v, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
999
+ { prefix: { 'chemical_name.keyword': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } },
1000
+ { term: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
1001
+ { prefix: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
1002
+ { term: { 'identifier_values': { value: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
1003
+ { prefix: { 'identifier_values': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } },
1004
+ { match: { 'synonyms': { query: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
1005
+ { match: { 'chemical_name': { query: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
1006
+ { match_phrase_prefix: { 'synonyms': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
1007
+ { match_phrase_prefix: { 'chemical_name': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
1008
+ );
1009
+ }
1010
+
891
1011
  const result = await this.connection.invokeOpenSearch({
892
1012
  operation: 'search',
893
1013
  body: {
894
1014
  size: limit,
895
1015
  query: {
896
1016
  bool: {
897
- should: [
898
- { term: { 'synonyms.keyword': { value: synonymTerm, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
899
- { prefix: { 'synonyms.keyword': { value: synonymTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
900
- { term: { 'chemical_name.keyword': { value: synonymTerm, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
901
- { prefix: { 'chemical_name.keyword': { value: synonymTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } },
902
- { match: { 'synonyms': { query: synonymTerm, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
903
- { match: { 'chemical_name': { query: synonymTerm, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
904
- { match_phrase_prefix: { 'synonyms': { query: synonymTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
905
- { match_phrase_prefix: { 'chemical_name': { query: synonymTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
906
- ],
1017
+ should: shouldClauses,
907
1018
  minimum_should_match: 1
908
1019
  }
909
1020
  },
@@ -1102,24 +1213,16 @@ class ChemicalsService {
1102
1213
 
1103
1214
  get: async (params) => {
1104
1215
  const id = params.id;
1105
- let source = null;
1216
+ let chemical = await this.getChemicalBySourceId(id);
1106
1217
 
1107
- const chemical = await this.getChemicalBySourceId(id);
1108
- if (chemical) {
1109
- source = this._chemicalRowToLegacySource(chemical);
1110
- } else {
1218
+ if (!chemical) {
1111
1219
  const hit = await this.findChemicalByIdentifier(id);
1112
- if (hit) {
1113
- source = this._toLegacyChemicalSource({
1114
- name: hit.name,
1115
- cas: hit.cas,
1116
- identifiers: hit.identifiers,
1117
- id: hit.id
1118
- });
1220
+ if (hit && hit.id) {
1221
+ chemical = await this.getChemicalById(hit.id);
1119
1222
  }
1120
1223
  }
1121
1224
 
1122
- if (!source) {
1225
+ if (!chemical) {
1123
1226
  return {
1124
1227
  body: {
1125
1228
  _index: params.index,
@@ -1130,6 +1233,10 @@ class ChemicalsService {
1130
1233
  };
1131
1234
  }
1132
1235
 
1236
+ const source = (params.index === 'chemical_data_index')
1237
+ ? this._chemicalRowToCDISource(chemical)
1238
+ : this._chemicalRowToLegacySource(chemical);
1239
+
1133
1240
  return {
1134
1241
  body: {
1135
1242
  _index: params.index,
package/lib/db/index.js CHANGED
@@ -4,6 +4,16 @@ function getFieldValue(field) {
4
4
  if ('longValue' in field) return field.longValue;
5
5
  if ('doubleValue' in field) return field.doubleValue;
6
6
  if ('booleanValue' in field) return field.booleanValue;
7
+ if ('blobValue' in field) return field.blobValue;
8
+ if ('arrayValue' in field && field.arrayValue) {
9
+ const av = field.arrayValue;
10
+ if (Array.isArray(av.stringValues)) return av.stringValues;
11
+ if (Array.isArray(av.longValues)) return av.longValues;
12
+ if (Array.isArray(av.doubleValues)) return av.doubleValues;
13
+ if (Array.isArray(av.booleanValues)) return av.booleanValues;
14
+ if (Array.isArray(av.arrayValues)) return av.arrayValues;
15
+ return [];
16
+ }
7
17
  return null;
8
18
  }
9
19
 
package/lib/search.js CHANGED
@@ -383,3 +383,4 @@ class SearchService {
383
383
  }
384
384
 
385
385
  module.exports = SearchService;
386
+ module.exports.getCasNumberVariations = getCasNumberVariations;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@toxplanet/pegasus-sdk",
3
- "version": "1.2.8",
3
+ "version": "1.2.11",
4
4
  "description": "SDK for migrating chemical data to Pegasus PostgreSQL + OpenSearch architecture with Elasticsearch client compatibility",
5
5
  "main": "index.js",
6
6
  "type": "commonjs",