@toxplanet/pegasus-sdk 1.2.7 → 1.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/lib/chemicals.js +144 -27
  2. package/package.json +1 -1
package/lib/chemicals.js CHANGED
@@ -8,9 +8,15 @@ const SEARCH_BOOST_PREFIX_SECONDARY = 10;
8
8
 
9
9
  const ALLOWED_IDENTIFIER_TYPES = new Set(['CAS', 'SMILES', 'InChI', 'InChIKey', 'PubChem', 'CID', 'DTXSID', 'EINECS', 'EC']);
10
10
 
11
- const INCHIKEY_PATTERN = /^[A-Z0-9]{14}-[A-Z0-9]{8,10}-[A-Z0-9]$/;
12
- const EC_PATTERN = /^\d{3}-\d{3}-\d$/;
13
- const CID_PATTERN = /^CID/i;
11
+ // Equivalence groups for CDI identifier_key normalization. When any member of
12
+ // a group is present on a row, the others are backfilled with the same values
13
+ // so consumers can look up by their preferred spelling.
14
+ const IDENTIFIER_ALIAS_GROUPS = [
15
+ { keys: ['cid', 'pubchem_cid', 'pubchem'], names: { cid: 'CID', pubchem_cid: 'PubChem CID', pubchem: 'PubChem' } },
16
+ { keys: ['inchikey', 'inchi_key'], names: { inchikey: 'InChIKey', inchi_key: 'InChIKey' } },
17
+ { keys: ['ec', 'ec_number'], names: { ec: 'EC', ec_number: 'EC Number' } },
18
+ { keys: ['cas', 'cas_number', 'cas_rn'], names: { cas: 'CAS', cas_number: 'CAS Number', cas_rn: 'CAS RN' } }
19
+ ];
14
20
 
15
21
  function escapeLikePattern(value) {
16
22
  return value.replace(/[%_\\]/g, '\\$&');
@@ -65,18 +71,29 @@ function transformChemicalMeta(meta) {
65
71
 
66
72
  function transformChemicalIdentifiers(identifiers) {
67
73
  if (!identifiers || typeof identifiers !== 'object') return [];
74
+
68
75
  if (Array.isArray(identifiers)) {
69
- // If it's already in new format, return as-is
70
- if (identifiers.length > 0 && identifiers[0].type !== undefined) {
71
- return identifiers;
72
- }
73
- // Transform from old format { identifier_key, identifier_value, ... } to new format { type, value }
76
+ if (identifiers.length === 0) return [];
77
+ if (identifiers[0].type !== undefined) return identifiers;
78
+ // Legacy { identifier_key, identifier_value } row form
74
79
  return identifiers.map(item => ({
75
80
  type: item.identifier_key || item.type,
76
81
  value: Array.isArray(item.identifier_value) ? item.identifier_value[0] : (item.value || item.identifier_value)
77
82
  }));
78
83
  }
79
- return [];
84
+
85
+ // Legacy ES-doc form: { CAS: ["71-43-2"], CID: ["241"], InChIKey: "ABC..." }
86
+ // Flatten one entry per value, preserving multi-valued types.
87
+ const result = [];
88
+ for (const [type, value] of Object.entries(identifiers)) {
89
+ if (value == null) continue;
90
+ const values = Array.isArray(value) ? value : [value];
91
+ for (const v of values) {
92
+ if (v == null || v === '') continue;
93
+ result.push({ type, value: String(v) });
94
+ }
95
+ }
96
+ return result;
80
97
  }
81
98
 
82
99
  class ChemicalsService {
@@ -152,6 +169,85 @@ class ChemicalsService {
152
169
  });
153
170
  }
154
171
 
172
+ _expandIdentifierAliases(identifiers) {
173
+ const byKey = new Map(identifiers.map((i) => [i.identifier_key, i]));
174
+ const additions = [];
175
+ for (const group of IDENTIFIER_ALIAS_GROUPS) {
176
+ const present = group.keys.find((k) => byKey.has(k));
177
+ if (!present) continue;
178
+ const source = byKey.get(present);
179
+ for (const aliasKey of group.keys) {
180
+ if (byKey.has(aliasKey)) continue;
181
+ const alias = {
182
+ identifier_key: aliasKey,
183
+ identifier_name: group.names[aliasKey] || aliasKey,
184
+ identifier_value: [...source.identifier_value]
185
+ };
186
+ additions.push(alias);
187
+ byKey.set(aliasKey, alias);
188
+ }
189
+ }
190
+ return identifiers.concat(additions);
191
+ }
192
+
193
+ _chemicalRowToCDISource(chemical) {
194
+ if (!chemical) return null;
195
+
196
+ const identifierGroups = new Map();
197
+ const identifierList = Array.isArray(chemical.chemicalIdentifiers) ? chemical.chemicalIdentifiers : [];
198
+ for (const item of identifierList) {
199
+ if (!item || typeof item !== 'object') continue;
200
+ const type = item.type || item.identifier_key || '';
201
+ if (!type) continue;
202
+ const rawValue = item.value !== undefined ? item.value : item.identifier_value;
203
+ const values = Array.isArray(rawValue) ? rawValue : (rawValue != null ? [rawValue] : []);
204
+ if (!identifierGroups.has(type)) {
205
+ identifierGroups.set(type, {
206
+ identifier_key: String(type).toLowerCase(),
207
+ identifier_name: String(type),
208
+ identifier_value: []
209
+ });
210
+ }
211
+ const group = identifierGroups.get(type);
212
+ for (const v of values) {
213
+ if (v == null || v === '') continue;
214
+ group.identifier_value.push(String(v));
215
+ }
216
+ }
217
+ const identifiers = this._expandIdentifierAliases(Array.from(identifierGroups.values()));
218
+
219
+ const metaList = Array.isArray(chemical.chemicalMeta) ? chemical.chemicalMeta : [];
220
+ const meta = metaList.map((item) => {
221
+ const key = (item && (item.key || item.meta_key)) || '';
222
+ const rawValue = item && (item.value !== undefined ? item.value : item.meta_value_text);
223
+ const valueArr = Array.isArray(rawValue)
224
+ ? rawValue.map((v) => String(v))
225
+ : (rawValue != null ? [String(rawValue)] : []);
226
+ const out = {
227
+ meta_key: String(key).toLowerCase(),
228
+ meta_value_text: valueArr
229
+ };
230
+ const unit = item && (item.unit || item.meta_value_unit);
231
+ if (unit) out.meta_value_unit = unit;
232
+ return out;
233
+ });
234
+
235
+ const synonyms = Array.isArray(chemical.chemicalSynonyms) ? chemical.chemicalSynonyms : [];
236
+ const names = [chemical.chemicalName, ...synonyms].filter(Boolean);
237
+
238
+ return {
239
+ chemical_set_identifier: chemical.sourceId || '',
240
+ chemical_primary_name: chemical.chemicalName || '',
241
+ chemical_names: names,
242
+ chemical_synonyms: synonyms,
243
+ chemical_categories: chemical.chemicalCategories || [],
244
+ chemical_identifiers: identifiers,
245
+ chemical_meta: meta,
246
+ chemical_created_at: chemical.createdAt,
247
+ chemical_updated_at: chemical.updatedAt
248
+ };
249
+ }
250
+
155
251
  _mapChemicalRow(row) {
156
252
  if (!row) return null;
157
253
  return {
@@ -658,7 +754,7 @@ class ChemicalsService {
658
754
  }
659
755
 
660
756
  await this.connection.ensureConnected();
661
-
757
+
662
758
  const sql = `SELECT * FROM chemicals WHERE chemical_identifiers->>'${identifierType}' = :value OR chemical_identifiers->'${identifierType}' ? :value`;
663
759
  const params = [{ name: 'value', value: { stringValue: identifierValue } }];
664
760
  const result = await this.connection.query(sql, params);
@@ -669,6 +765,36 @@ class ChemicalsService {
669
765
  }
670
766
  }
671
767
 
768
+ async findChemicalByIdentifier(identifierValue) {
769
+ if (!identifierValue) return null;
770
+ const result = await this.connection.invokeOpenSearch({
771
+ operation: 'search',
772
+ body: {
773
+ size: 1,
774
+ query: {
775
+ bool: {
776
+ should: [
777
+ { term: { 'cas_numbers.keyword': identifierValue } },
778
+ { term: { 'identifier_values.keyword': identifierValue } }
779
+ ],
780
+ minimum_should_match: 1
781
+ }
782
+ },
783
+ _source: ['postgres_id', 'chemical_name', 'cas_numbers', 'identifier_values', 'synonyms']
784
+ }
785
+ });
786
+
787
+ const hit = result?.hits?.hits?.[0]?._source;
788
+ if (!hit) return null;
789
+ return {
790
+ id: hit.postgres_id,
791
+ name: hit.chemical_name,
792
+ cas: hit.cas_numbers || [],
793
+ identifiers: hit.identifier_values || [],
794
+ synonyms: hit.synonyms || []
795
+ };
796
+ }
797
+
672
798
  async countByCollection(collectionName) {
673
799
  try {
674
800
  await this.connection.ensureConnected();
@@ -1068,22 +1194,9 @@ class ChemicalsService {
1068
1194
  let chemical = await this.getChemicalBySourceId(id);
1069
1195
 
1070
1196
  if (!chemical) {
1071
- const casMatches = await this.getChemicalsByCAS(id);
1072
- if (casMatches.length > 0) chemical = casMatches[0];
1073
- }
1074
-
1075
- if (!chemical) {
1076
- let identifierType = null;
1077
- if (CID_PATTERN.test(id)) {
1078
- identifierType = 'CID';
1079
- } else if (INCHIKEY_PATTERN.test(id)) {
1080
- identifierType = 'InChIKey';
1081
- } else if (EC_PATTERN.test(id)) {
1082
- identifierType = 'EC';
1083
- }
1084
- if (identifierType) {
1085
- const matches = await this.getChemicalsByIdentifier(identifierType, id);
1086
- if (matches.length > 0) chemical = matches[0];
1197
+ const hit = await this.findChemicalByIdentifier(id);
1198
+ if (hit && hit.id) {
1199
+ chemical = await this.getChemicalById(hit.id);
1087
1200
  }
1088
1201
  }
1089
1202
 
@@ -1098,13 +1211,17 @@ class ChemicalsService {
1098
1211
  };
1099
1212
  }
1100
1213
 
1214
+ const source = (params.index === 'chemical_data_index')
1215
+ ? this._chemicalRowToCDISource(chemical)
1216
+ : this._chemicalRowToLegacySource(chemical);
1217
+
1101
1218
  return {
1102
1219
  body: {
1103
1220
  _index: params.index,
1104
1221
  _id: params.id,
1105
1222
  _version: 1,
1106
1223
  found: true,
1107
- _source: this._chemicalRowToLegacySource(chemical)
1224
+ _source: source
1108
1225
  },
1109
1226
  statusCode: 200
1110
1227
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@toxplanet/pegasus-sdk",
3
- "version": "1.2.7",
3
+ "version": "1.2.9",
4
4
  "description": "SDK for migrating chemical data to Pegasus PostgreSQL + OpenSearch architecture with Elasticsearch client compatibility",
5
5
  "main": "index.js",
6
6
  "type": "commonjs",