@toxplanet/pegasus-sdk 1.2.2 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/environment.acc.js +4 -3
- package/config/environment.dev.js +18 -17
- package/config/environment.prod.js +5 -4
- package/config/environment.qa.js +4 -3
- package/lib/chemicals.js +1149 -1149
- package/lib/connection.js +217 -217
- package/lib/db/index.js +26 -26
- package/package.json +47 -47
package/lib/chemicals.js
CHANGED
|
@@ -1,1150 +1,1150 @@
|
|
|
1
|
-
const { logError, logInfo } = require('@toxplanet/tphelper/logging');
|
|
2
|
-
const { SQSClient, SendMessageCommand } = require('@aws-sdk/client-sqs');
|
|
3
|
-
|
|
4
|
-
// Search ranking weights. Their consumers are outside the visible part of this file;
// presumably exact/prefix matches on a primary vs. a secondary field — confirm at the call sites.
const SEARCH_BOOST_EXACT_PRIMARY = 100;
const SEARCH_BOOST_PREFIX_PRIMARY = 50;
const SEARCH_BOOST_EXACT_SECONDARY = 30;
const SEARCH_BOOST_PREFIX_SECONDARY = 10;

// Identifier keys accepted by getChemicalsByIdentifier, which interpolates the key
// directly into SQL text and therefore must only ever see one of these values.
const ALLOWED_IDENTIFIER_TYPES = new Set([
  'CAS',
  'SMILES',
  'InChI',
  'InChIKey',
  'PubChem',
  'DTXSID',
  'EINECS',
  'EC',
]);
|
|
10
|
-
|
|
11
|
-
/**
 * Escapes the characters that are special inside a SQL LIKE pattern.
 *
 * Every `%`, `_` and backslash in the input is prefixed with a backslash so the
 * value matches literally when embedded in a LIKE expression.
 *
 * @param {string} value - Raw text to embed in a LIKE pattern.
 * @returns {string} The text with LIKE metacharacters escaped.
 */
function escapeLikePattern(value) {
  const likeSpecials = /[%_\\]/g;
  return value.replace(likeSpecials, (match) => `\\${match}`);
}
|
|
14
|
-
|
|
15
|
-
/**
 * Parses a one-dimensional PostgreSQL array literal (e.g. `{a,"b,c"}`) into a
 * JavaScript array of strings.
 *
 * Handles double-quoted elements, backslash escapes (`\"` and `\\`, which is how
 * PostgreSQL and `_toPostgresArray` encode quotes and backslashes), and the doubled-quote
 * form (`""` inside a quoted element) accepted by the previous implementation.
 * Empty-string elements are preserved, including a trailing one.
 * Unquoted `NULL` elements are returned as the string `'NULL'` (unchanged behavior).
 *
 * @param {string|Array|null|undefined} str - Array literal, or an already-decoded array.
 * @returns {string[]} The parsed elements; `[]` for empty, `{}` or missing input.
 */
function parsePostgresArray(str) {
  // Some drivers hand back arrays already decoded; slicing those as text would mangle them.
  if (Array.isArray(str)) return [...str];
  if (!str || str === '{}') return [];

  const inner = str.slice(1, -1);
  if (!inner) return [];

  const result = [];
  let current = '';
  let inQuotes = false;

  for (let i = 0; i < inner.length; i++) {
    const char = inner[i];

    if (char === '\\' && i + 1 < inner.length) {
      // Backslash escape: take the next character literally.
      current += inner[i + 1];
      i++;
    } else if (char === '"') {
      if (inQuotes && inner[i + 1] === '"') {
        // Doubled quote inside a quoted element stands for one literal quote.
        current += '"';
        i++;
      } else {
        inQuotes = !inQuotes;
      }
    } else if (char === ',' && !inQuotes) {
      result.push(current);
      current = '';
    } else {
      current += char;
    }
  }

  // `inner` is non-empty, so there is always a final element — even if it is "".
  result.push(current);
  return result;
}
|
|
41
|
-
|
|
42
|
-
/**
 * Normalizes chemical metadata into the `{ key, value, [unit] }` entry format.
 *
 * - Anything that is not an array yields `[]`.
 * - An array whose first entry already has a `key` property is returned as-is
 *   (same array instance).
 * - Otherwise each entry is mapped from the legacy shape
 *   (`meta_key`, `meta_value_text`) to the new one; `value` defaults to `[]`
 *   and `unit` is copied only when truthy.
 *
 * @param {*} meta - Metadata in either format.
 * @returns {Array<{key: *, value: *, unit?: *}>} Normalized entries.
 */
function transformChemicalMeta(meta) {
  if (!Array.isArray(meta)) return [];

  // Only the first entry is inspected to decide whether the array is already normalized.
  const alreadyNormalized = meta.length !== 0 && meta[0].key !== undefined;
  if (alreadyNormalized) return meta;

  return meta.map((entry) => {
    const normalized = {
      key: entry.meta_key || entry.key,
      value: entry.meta_value_text || entry.value || [],
    };
    return entry.unit ? { ...normalized, unit: entry.unit } : normalized;
  });
}
|
|
61
|
-
|
|
62
|
-
/**
 * Normalizes chemical identifiers into the `{ type, value }` entry format.
 *
 * - Anything that is not an array yields `[]`.
 * - An array whose first entry already has a `type` property is returned as-is
 *   (same array instance).
 * - Otherwise each entry is mapped from the legacy shape
 *   (`identifier_key`, `identifier_value`); when `identifier_value` is an array
 *   only its first element is kept.
 *
 * @param {*} identifiers - Identifiers in either format.
 * @returns {Array<{type: *, value: *}>} Normalized entries.
 */
function transformChemicalIdentifiers(identifiers) {
  if (!Array.isArray(identifiers)) return [];

  // Only the first entry is inspected to decide whether the array is already normalized.
  const alreadyNormalized = identifiers.length !== 0 && identifiers[0].type !== undefined;
  if (alreadyNormalized) return identifiers;

  return identifiers.map((entry) => {
    const legacyValue = entry.identifier_value;
    const value = Array.isArray(legacyValue) ? legacyValue[0] : (entry.value || legacyValue);
    return { type: entry.identifier_key || entry.type, value };
  });
}
|
|
77
|
-
|
|
78
|
-
class ChemicalsService {
|
|
79
|
-
constructor(connection) {
|
|
80
|
-
this.connection = connection;
|
|
81
|
-
this.sqsClient = null;
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
_parsePostgresArray(str) {
|
|
85
|
-
return parsePostgresArray(str);
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
_toPostgresArray(arr) {
|
|
89
|
-
if (!Array.isArray(arr) || arr.length === 0) return '{}';
|
|
90
|
-
return '{' + arr.map(s => `"${String(s).replace(/\\/g, '\\\\').replace(/"/g, '\\"')}"`).join(',') + '}';
|
|
91
|
-
}
|
|
92
|
-
|
|
93
|
-
_serializeDate(d) {
|
|
94
|
-
return d instanceof Date ? d.toISOString() : (d || new Date().toISOString());
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
_mapChemicalRow(row) {
|
|
98
|
-
if (!row) return null;
|
|
99
|
-
return {
|
|
100
|
-
chemicalId: row.chemical_id,
|
|
101
|
-
sourceId: row.source_id,
|
|
102
|
-
chemicalName: row.chemical_name,
|
|
103
|
-
chemicalMeta: row.chemical_meta ? (Array.isArray(row.chemical_meta) ? row.chemical_meta : JSON.parse(row.chemical_meta)) : null,
|
|
104
|
-
chemicalIdentifiers: row.chemical_identifiers ? (Array.isArray(row.chemical_identifiers) ? row.chemical_identifiers : JSON.parse(row.chemical_identifiers)) : null,
|
|
105
|
-
chemicalSynonyms: this._parsePostgresArray(row.chemical_synonyms),
|
|
106
|
-
chemicalCategories: this._parsePostgresArray(row.chemical_categories),
|
|
107
|
-
createdAt: row.created_at,
|
|
108
|
-
updatedAt: row.updated_at,
|
|
109
|
-
importedAt: row.imported_at
|
|
110
|
-
};
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
/**
 * Posts a failed SQL write to the "failed items" SQS queue so it can be repaired later.
 *
 * The queue URL comes from `SQS_FAILED_ITEMS_QUEUE`, or is derived from the
 * connection config (`awsAccountId`, `environment`) and the region. This method never
 * throws: any failure is logged and reported through the return value.
 *
 * @param {object} details
 * @param {string} details.sql - Statement that failed.
 * @param {Array} details.parameters - Parameters bound to the statement.
 * @param {Error} details.error - The error from the last attempt; its `message` is sent.
 * @param {number} details.retryCount - Number of local retries already attempted.
 * @param {Date} [details.failedAt] - Failure time; defaults to now.
 * @returns {Promise<boolean>} `true` if the message was sent, `false` otherwise.
 */
async sendSqlWriteFailure({ sql, parameters, error, retryCount, failedAt }) {
  try {
    const envQueueUrl = process.env.SQS_FAILED_ITEMS_QUEUE;
    const region = process.env.AWS_REGION || this.connection.region;
    const { awsAccountId, environment } = this.connection.config;

    // The environment variable wins; otherwise fall back to the conventional queue name.
    let queueUrl = envQueueUrl;
    if (!queueUrl && awsAccountId) {
      queueUrl = `https://sqs.${region}.amazonaws.com/${awsAccountId}/cr-pegasus-failed-items-${environment}`;
    }

    if (!queueUrl) {
      logError('pegasus-sdk', 'sendSqlWriteFailure', 'No SQS queue URL available: set SQS_FAILED_ITEMS_QUEUE or provide awsAccountId in config');
      return false;
    }

    const origin = envQueueUrl ? ' (from env)' : ' (default)';
    logInfo('pegasus-sdk', `[sendSqlWriteFailure] Using queue: ${queueUrl}${origin}`);

    // Create the client once and reuse it for later failures.
    if (!this.sqsClient) {
      this.sqsClient = new SQSClient({ region });
    }

    const payload = {
      MessageType: 'SqlWriteFailure',
      SourceService: this.connection.config.sourceService || 'pegasus-sdk',
      Timestamp: (failedAt || new Date()).toISOString(),
      Sql: sql,
      Parameters: parameters,
      OriginalError: error.message,
      RetryCount: retryCount
    };

    const sendCommand = new SendMessageCommand({
      QueueUrl: queueUrl,
      MessageBody: JSON.stringify(payload)
    });

    const { MessageId } = await this.sqsClient.send(sendCommand);
    logInfo('pegasus-sdk', `[sendSqlWriteFailure] SqlWriteFailure posted to SQS: MessageId=${MessageId}, RetryCount=${retryCount}`);
    return true;
  } catch (sqsError) {
    logError('pegasus-sdk', 'sendSqlWriteFailure', 'Failed to post SqlWriteFailure to SQS', sqsError);
    return false;
  }
}
|
|
156
|
-
|
|
157
|
-
_buildChemicalUpsertSql(chemical) {
|
|
158
|
-
const upsertSql = [
|
|
159
|
-
'INSERT INTO chemicals (source_id, chemical_name, chemical_meta, chemical_identifiers, chemical_synonyms, chemical_categories, created_at, updated_at)',
|
|
160
|
-
'VALUES (:source_id, :chemical_name, :chemical_meta::jsonb, :chemical_identifiers::jsonb, :chemical_synonyms::text[], :chemical_categories::text[], :created_at::timestamp, :updated_at::timestamp)',
|
|
161
|
-
'ON CONFLICT (source_id) DO UPDATE SET',
|
|
162
|
-
' chemical_name = EXCLUDED.chemical_name,',
|
|
163
|
-
' chemical_meta = EXCLUDED.chemical_meta,',
|
|
164
|
-
' chemical_identifiers = EXCLUDED.chemical_identifiers,',
|
|
165
|
-
' chemical_synonyms = EXCLUDED.chemical_synonyms,',
|
|
166
|
-
' chemical_categories = EXCLUDED.chemical_categories,',
|
|
167
|
-
' updated_at = EXCLUDED.updated_at',
|
|
168
|
-
'RETURNING chemical_id, source_id'
|
|
169
|
-
].join('\n');
|
|
170
|
-
|
|
171
|
-
const transformedMeta = transformChemicalMeta(chemical.chemicalMeta);
|
|
172
|
-
const transformedIdentifiers = transformChemicalIdentifiers(chemical.chemicalIdentifiers);
|
|
173
|
-
|
|
174
|
-
const parameters = [
|
|
175
|
-
{ name: 'source_id', value: { stringValue: chemical.sourceId } },
|
|
176
|
-
{ name: 'chemical_name', value: { stringValue: chemical.chemicalName } },
|
|
177
|
-
{ name: 'chemical_meta', value: { stringValue: JSON.stringify(transformedMeta) }, typeHint: 'JSON' },
|
|
178
|
-
{ name: 'chemical_identifiers', value: { stringValue: JSON.stringify(transformedIdentifiers) }, typeHint: 'JSON' },
|
|
179
|
-
{ name: 'chemical_synonyms', value: { stringValue: this._toPostgresArray(chemical.chemicalSynonyms ?? []) } },
|
|
180
|
-
{ name: 'chemical_categories', value: { stringValue: this._toPostgresArray(chemical.chemicalCategories ?? []) } },
|
|
181
|
-
{ name: 'created_at', value: { stringValue: this._serializeDate(chemical.createdAt) } },
|
|
182
|
-
{ name: 'updated_at', value: { stringValue: this._serializeDate(chemical.updatedAt) } }
|
|
183
|
-
];
|
|
184
|
-
|
|
185
|
-
return { sql: upsertSql, parameters };
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
async _executeChemicalUpsert(chemical) {
|
|
189
|
-
await this.connection.ensureConnected();
|
|
190
|
-
const { sql, parameters } = this._buildChemicalUpsertSql(chemical);
|
|
191
|
-
const queryResult = await this.connection.query(sql, parameters);
|
|
192
|
-
const row = queryResult.rows?.[0];
|
|
193
|
-
if (!row) return null;
|
|
194
|
-
return {
|
|
195
|
-
chemicalId: row.chemical_id,
|
|
196
|
-
sourceId: row.source_id
|
|
197
|
-
};
|
|
198
|
-
}
|
|
199
|
-
|
|
200
|
-
_buildDebugSql(chemical) {
|
|
201
|
-
const esc = (s) => `'${String(s ?? '').replace(/'/g, "''")}'`;
|
|
202
|
-
const escJson = (v) => `'${JSON.stringify(v ?? {}).replace(/'/g, "''")}'`;
|
|
203
|
-
const escArr = (arr) => {
|
|
204
|
-
if (!Array.isArray(arr) || arr.length === 0) return `ARRAY[]::text[]`;
|
|
205
|
-
return `ARRAY[${arr.map(s => esc(s)).join(', ')}]`;
|
|
206
|
-
};
|
|
207
|
-
const escDate = (d) => esc(d instanceof Date ? d.toISOString() : (d ?? new Date().toISOString()));
|
|
208
|
-
|
|
209
|
-
return [
|
|
210
|
-
`INSERT INTO chemicals`,
|
|
211
|
-
` (source_id, chemical_name, chemical_meta, chemical_identifiers, chemical_synonyms, chemical_categories, created_at, updated_at)`,
|
|
212
|
-
`VALUES (`,
|
|
213
|
-
` ${esc(chemical.sourceId)},`,
|
|
214
|
-
` ${esc(chemical.chemicalName)},`,
|
|
215
|
-
` ${escJson(chemical.chemicalMeta)}::jsonb,`,
|
|
216
|
-
` ${escJson(chemical.chemicalIdentifiers)}::jsonb,`,
|
|
217
|
-
` ${escArr(chemical.chemicalSynonyms)},`,
|
|
218
|
-
` ${escArr(chemical.chemicalCategories)},`,
|
|
219
|
-
` ${escDate(chemical.createdAt)},`,
|
|
220
|
-
` ${escDate(chemical.updatedAt)}`,
|
|
221
|
-
`)`,
|
|
222
|
-
`ON CONFLICT (source_id) DO UPDATE SET`,
|
|
223
|
-
` chemical_name = ${esc(chemical.chemicalName)},`,
|
|
224
|
-
` chemical_meta = ${escJson(chemical.chemicalMeta)}::jsonb,`,
|
|
225
|
-
` chemical_identifiers = ${escJson(chemical.chemicalIdentifiers)}::jsonb,`,
|
|
226
|
-
` chemical_synonyms = ${escArr(chemical.chemicalSynonyms)},`,
|
|
227
|
-
` chemical_categories = ${escArr(chemical.chemicalCategories)},`,
|
|
228
|
-
` updated_at = NOW();`
|
|
229
|
-
].join('\n');
|
|
230
|
-
}
|
|
231
|
-
|
|
232
|
-
async bulkIndexFielded(documents) {
|
|
233
|
-
try {
|
|
234
|
-
logInfo('pegasus-sdk', `[bulkIndexFielded] Starting bulk index with ${documents?.length || 0} documents`);
|
|
235
|
-
|
|
236
|
-
if (!documents || documents.length === 0) {
|
|
237
|
-
logInfo('pegasus-sdk', `[bulkIndexFielded] No documents provided, returning empty result`);
|
|
238
|
-
return { indexed: 0, errors: [], results: [] };
|
|
239
|
-
}
|
|
240
|
-
|
|
241
|
-
const results = [];
|
|
242
|
-
const errors = [];
|
|
243
|
-
|
|
244
|
-
logInfo('pegasus-sdk', `[bulkIndexFielded] Database connection established`);
|
|
245
|
-
|
|
246
|
-
for (let i = 0; i < documents.length; i++) {
|
|
247
|
-
const doc = documents[i];
|
|
248
|
-
logInfo('pegasus-sdk', `[bulkIndexFielded] Processing document ${i}: source_id=${doc.source_id}, chemical_name=${doc.chemical_name}`);
|
|
249
|
-
|
|
250
|
-
const parseDate = (dateValue) => {
|
|
251
|
-
if (!dateValue) return new Date();
|
|
252
|
-
if (dateValue instanceof Date) return dateValue;
|
|
253
|
-
if (typeof dateValue === 'string') return new Date(dateValue);
|
|
254
|
-
return new Date();
|
|
255
|
-
};
|
|
256
|
-
|
|
257
|
-
const chemical = {
|
|
258
|
-
sourceId: doc.source_id || doc._id,
|
|
259
|
-
chemicalName: doc.chemical_name || doc.name,
|
|
260
|
-
chemicalMeta: doc.chemical_meta || {},
|
|
261
|
-
chemicalIdentifiers: doc.chemical_identifiers || {},
|
|
262
|
-
chemicalSynonyms: doc.chemical_synonyms || [],
|
|
263
|
-
chemicalCategories: doc.chemical_categories || [],
|
|
264
|
-
createdAt: parseDate(doc.created_at),
|
|
265
|
-
updatedAt: parseDate(doc.updated_at),
|
|
266
|
-
...(doc.imported_at && { importedAt: doc.imported_at }),
|
|
267
|
-
...(doc.chemical_id && { chemicalId: doc.chemical_id })
|
|
268
|
-
};
|
|
269
|
-
|
|
270
|
-
logInfo('pegasus-sdk', `[bulkIndexFielded] Prepared chemical object: sourceId=${chemical.sourceId}, chemicalName=${chemical.chemicalName}`);
|
|
271
|
-
logInfo('pegasus-sdk', `[bulkIndexFielded] DEBUG SQL for document ${i}:\n${this._buildDebugSql(chemical)}`);
|
|
272
|
-
|
|
273
|
-
const attemptUpsert = async () => {
|
|
274
|
-
const result = await this._executeChemicalUpsert(chemical);
|
|
275
|
-
return result ? [result] : [];
|
|
276
|
-
};
|
|
277
|
-
|
|
278
|
-
let lastError = null;
|
|
279
|
-
let retryCount = 0;
|
|
280
|
-
const failedAt = new Date();
|
|
281
|
-
|
|
282
|
-
try {
|
|
283
|
-
const [result] = await attemptUpsert();
|
|
284
|
-
logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} indexed successfully: ${result?.chemicalId || 'no ID returned'}`);
|
|
285
|
-
results.push({ index: i, success: true, result });
|
|
286
|
-
continue;
|
|
287
|
-
} catch (firstErr) {
|
|
288
|
-
lastError = firstErr;
|
|
289
|
-
logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} first attempt failed (${firstErr.message}), retrying once`);
|
|
290
|
-
|
|
291
|
-
try {
|
|
292
|
-
const [result] = await attemptUpsert();
|
|
293
|
-
logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} indexed successfully on retry: ${result?.chemicalId || 'no ID returned'}`);
|
|
294
|
-
results.push({ index: i, success: true, result });
|
|
295
|
-
continue;
|
|
296
|
-
} catch (retryErr) {
|
|
297
|
-
lastError = retryErr;
|
|
298
|
-
retryCount = 1;
|
|
299
|
-
}
|
|
300
|
-
}
|
|
301
|
-
|
|
302
|
-
logError('pegasus-sdk', 'bulkIndexFielded', `Document ${i} failed after ${retryCount} local retries (source_id=${chemical.sourceId})`, lastError);
|
|
303
|
-
|
|
304
|
-
const { sql: failureSql, parameters: failureParams } = this._buildChemicalUpsertSql(chemical);
|
|
305
|
-
const queued = await this.sendSqlWriteFailure({
|
|
306
|
-
sql: failureSql,
|
|
307
|
-
parameters: failureParams,
|
|
308
|
-
error: lastError,
|
|
309
|
-
retryCount,
|
|
310
|
-
failedAt
|
|
311
|
-
});
|
|
312
|
-
|
|
313
|
-
if (queued) {
|
|
314
|
-
logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} (source_id=${chemical.sourceId}) queued for repair via SQS`);
|
|
315
|
-
} else {
|
|
316
|
-
logError('pegasus-sdk', 'bulkIndexFielded', `Document ${i} (source_id=${chemical.sourceId}) failed and could not be queued — data loss risk`, lastError);
|
|
317
|
-
}
|
|
318
|
-
|
|
319
|
-
results.push({ index: i, success: false, error: lastError.message, queued });
|
|
320
|
-
errors.push({ document: doc, error: lastError.message, queued });
|
|
321
|
-
}
|
|
322
|
-
|
|
323
|
-
const successCount = results.filter(r => r.success).length;
|
|
324
|
-
const queuedCount = results.filter(r => !r.success && r.queued).length;
|
|
325
|
-
logInfo('pegasus-sdk', `[bulkIndexFielded] Bulk index complete: ${successCount}/${documents.length} succeeded, ${queuedCount} queued for repair, ${errors.length - queuedCount} unhandled errors`);
|
|
326
|
-
|
|
327
|
-
return { indexed: successCount, errors, results };
|
|
328
|
-
} catch (error) {
|
|
329
|
-
logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexFielded', error);
|
|
330
|
-
throw error;
|
|
331
|
-
}
|
|
332
|
-
}
|
|
333
|
-
|
|
334
|
-
async bulkIndexFulltext(documents) {
|
|
335
|
-
try {
|
|
336
|
-
return { acknowledged: true, count: documents?.length || 0 };
|
|
337
|
-
} catch (error) {
|
|
338
|
-
logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexFulltext', error);
|
|
339
|
-
throw error;
|
|
340
|
-
}
|
|
341
|
-
}
|
|
342
|
-
|
|
343
|
-
async bulkIndexSubstances(substances) {
|
|
344
|
-
try {
|
|
345
|
-
const documents = substances.map(substance => ({
|
|
346
|
-
source_id: substance.substance_id || substance.id,
|
|
347
|
-
chemical_name: substance.name || substance.substance_name,
|
|
348
|
-
chemical_meta: substance.meta || {},
|
|
349
|
-
chemical_identifiers: substance.identifiers || {},
|
|
350
|
-
chemical_synonyms: substance.synonyms || [],
|
|
351
|
-
chemical_categories: substance.categories || substance.substance_types || [],
|
|
352
|
-
created_at: substance.created_at,
|
|
353
|
-
updated_at: substance.updated_at,
|
|
354
|
-
imported_at: substance.imported_at
|
|
355
|
-
}));
|
|
356
|
-
|
|
357
|
-
return await this.bulkIndexFielded(documents);
|
|
358
|
-
} catch (error) {
|
|
359
|
-
logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexSubstances', error);
|
|
360
|
-
throw error;
|
|
361
|
-
}
|
|
362
|
-
}
|
|
363
|
-
|
|
364
|
-
async createChemical(chemical) {
|
|
365
|
-
try {
|
|
366
|
-
await this.connection.ensureConnected();
|
|
367
|
-
|
|
368
|
-
const transformedMeta = transformChemicalMeta(chemical.chemical_meta);
|
|
369
|
-
const transformedIdentifiers = transformChemicalIdentifiers(chemical.chemical_identifiers);
|
|
370
|
-
|
|
371
|
-
const columns = ['source_id', 'chemical_name', 'chemical_meta', 'chemical_identifiers', 'chemical_synonyms', 'chemical_categories', 'created_at', 'updated_at'];
|
|
372
|
-
const values = [':source_id', ':chemical_name', ':chemical_meta::jsonb', ':chemical_identifiers::jsonb', ':chemical_synonyms::text[]', ':chemical_categories::text[]', ':created_at::timestamp', ':updated_at::timestamp'];
|
|
373
|
-
const params = [
|
|
374
|
-
{ name: 'source_id', value: { stringValue: chemical.source_id } },
|
|
375
|
-
{ name: 'chemical_name', value: { stringValue: chemical.chemical_name } },
|
|
376
|
-
{ name: 'chemical_meta', value: { stringValue: JSON.stringify(transformedMeta) }, typeHint: 'JSON' },
|
|
377
|
-
{ name: 'chemical_identifiers', value: { stringValue: JSON.stringify(transformedIdentifiers) }, typeHint: 'JSON' },
|
|
378
|
-
{ name: 'chemical_synonyms', value: { stringValue: this._toPostgresArray(chemical.chemical_synonyms || []) } },
|
|
379
|
-
{ name: 'chemical_categories', value: { stringValue: this._toPostgresArray(chemical.chemical_categories || []) } },
|
|
380
|
-
{ name: 'created_at', value: { stringValue: this._serializeDate(chemical.created_at || new Date()) } },
|
|
381
|
-
{ name: 'updated_at', value: { stringValue: this._serializeDate(chemical.updated_at || new Date()) } }
|
|
382
|
-
];
|
|
383
|
-
|
|
384
|
-
if (chemical.imported_at) {
|
|
385
|
-
columns.push('imported_at');
|
|
386
|
-
values.push(':imported_at::timestamp');
|
|
387
|
-
params.push({ name: 'imported_at', value: { stringValue: this._serializeDate(chemical.imported_at) } });
|
|
388
|
-
}
|
|
389
|
-
|
|
390
|
-
if (chemical.chemical_id) {
|
|
391
|
-
columns.push('chemical_id');
|
|
392
|
-
values.push(':chemical_id');
|
|
393
|
-
params.push({ name: 'chemical_id', value: { stringValue: chemical.chemical_id } });
|
|
394
|
-
}
|
|
395
|
-
|
|
396
|
-
const sql = `INSERT INTO chemicals (${columns.join(', ')}) VALUES (${values.join(', ')}) RETURNING *`;
|
|
397
|
-
const result = await this.connection.query(sql, params);
|
|
398
|
-
return this._mapChemicalRow(result.rows?.[0]);
|
|
399
|
-
} catch (error) {
|
|
400
|
-
logError('pegasus-sdk', 'ChemicalsService', 'createChemical', error);
|
|
401
|
-
throw error;
|
|
402
|
-
}
|
|
403
|
-
}
|
|
404
|
-
|
|
405
|
-
async updateChemical(chemicalId, updates) {
|
|
406
|
-
try {
|
|
407
|
-
await this.connection.ensureConnected();
|
|
408
|
-
|
|
409
|
-
const setClauses = [];
|
|
410
|
-
const params = [];
|
|
411
|
-
|
|
412
|
-
if (updates.chemical_name) {
|
|
413
|
-
setClauses.push('chemical_name = :chemical_name');
|
|
414
|
-
params.push({ name: 'chemical_name', value: { stringValue: updates.chemical_name } });
|
|
415
|
-
}
|
|
416
|
-
if (updates.chemical_meta) {
|
|
417
|
-
const transformedMeta = transformChemicalMeta(updates.chemical_meta);
|
|
418
|
-
setClauses.push('chemical_meta = :chemical_meta::jsonb');
|
|
419
|
-
params.push({ name: 'chemical_meta', value: { stringValue: JSON.stringify(transformedMeta) }, typeHint: 'JSON' });
|
|
420
|
-
}
|
|
421
|
-
if (updates.chemical_identifiers) {
|
|
422
|
-
const transformedIdentifiers = transformChemicalIdentifiers(updates.chemical_identifiers);
|
|
423
|
-
setClauses.push('chemical_identifiers = :chemical_identifiers::jsonb');
|
|
424
|
-
params.push({ name: 'chemical_identifiers', value: { stringValue: JSON.stringify(transformedIdentifiers) }, typeHint: 'JSON' });
|
|
425
|
-
}
|
|
426
|
-
if (updates.chemical_synonyms) {
|
|
427
|
-
setClauses.push('chemical_synonyms = :chemical_synonyms::text[]');
|
|
428
|
-
params.push({ name: 'chemical_synonyms', value: { stringValue: this._toPostgresArray(updates.chemical_synonyms) } });
|
|
429
|
-
}
|
|
430
|
-
if (updates.chemical_categories) {
|
|
431
|
-
setClauses.push('chemical_categories = :chemical_categories::text[]');
|
|
432
|
-
params.push({ name: 'chemical_categories', value: { stringValue: this._toPostgresArray(updates.chemical_categories) } });
|
|
433
|
-
}
|
|
434
|
-
|
|
435
|
-
setClauses.push('updated_at = :updated_at::timestamp');
|
|
436
|
-
params.push({ name: 'updated_at', value: { stringValue: this._serializeDate(new Date()) } });
|
|
437
|
-
|
|
438
|
-
params.push({ name: 'id', value: { stringValue: chemicalId } });
|
|
439
|
-
|
|
440
|
-
const sql = `UPDATE chemicals SET ${setClauses.join(', ')} WHERE chemical_id = :id::uuid RETURNING *`;
|
|
441
|
-
const result = await this.connection.query(sql, params);
|
|
442
|
-
return this._mapChemicalRow(result.rows?.[0]) || null;
|
|
443
|
-
} catch (error) {
|
|
444
|
-
logError('pegasus-sdk', 'ChemicalsService', 'updateChemical', error);
|
|
445
|
-
throw error;
|
|
446
|
-
}
|
|
447
|
-
}
|
|
448
|
-
|
|
449
|
-
async deleteChemical(chemicalId) {
|
|
450
|
-
try {
|
|
451
|
-
await this.connection.ensureConnected();
|
|
452
|
-
|
|
453
|
-
const sql = 'DELETE FROM chemicals WHERE chemical_id = :id::uuid RETURNING *';
|
|
454
|
-
const params = [{ name: 'id', value: { stringValue: chemicalId } }];
|
|
455
|
-
const result = await this.connection.query(sql, params);
|
|
456
|
-
return this._mapChemicalRow(result.rows?.[0]) || null;
|
|
457
|
-
} catch (error) {
|
|
458
|
-
logError('pegasus-sdk', 'ChemicalsService', 'deleteChemical', error);
|
|
459
|
-
throw error;
|
|
460
|
-
}
|
|
461
|
-
}
|
|
462
|
-
|
|
463
|
-
async deleteBySourceId(sourceId) {
|
|
464
|
-
try {
|
|
465
|
-
await this.connection.ensureConnected();
|
|
466
|
-
|
|
467
|
-
const sql = 'DELETE FROM chemicals WHERE source_id = :source_id RETURNING *';
|
|
468
|
-
const params = [{ name: 'source_id', value: { stringValue: sourceId } }];
|
|
469
|
-
const result = await this.connection.query(sql, params);
|
|
470
|
-
return this._mapChemicalRow(result.rows?.[0]) || null;
|
|
471
|
-
} catch (error) {
|
|
472
|
-
logError('pegasus-sdk', 'ChemicalsService', 'deleteBySourceId', error);
|
|
473
|
-
throw error;
|
|
474
|
-
}
|
|
475
|
-
}
|
|
476
|
-
|
|
477
|
-
async deleteCollection(collectionName) {
|
|
478
|
-
try {
|
|
479
|
-
await this.connection.ensureConnected();
|
|
480
|
-
|
|
481
|
-
const sql = 'DELETE FROM chemicals WHERE :collection_name = ANY(chemical_categories) RETURNING *';
|
|
482
|
-
const params = [{ name: 'collection_name', value: { stringValue: collectionName } }];
|
|
483
|
-
const result = await this.connection.query(sql, params);
|
|
484
|
-
const deleted = result.rows.map(row => this._mapChemicalRow(row));
|
|
485
|
-
return { deletedCount: deleted.length, deleted };
|
|
486
|
-
} catch (error) {
|
|
487
|
-
logError('pegasus-sdk', 'ChemicalsService', 'deleteCollection', error);
|
|
488
|
-
throw error;
|
|
489
|
-
}
|
|
490
|
-
}
|
|
491
|
-
|
|
492
|
-
async updateCollectionProperty(collectionName, propertyPath, newValue) {
|
|
493
|
-
try {
|
|
494
|
-
await this.connection.ensureConnected();
|
|
495
|
-
|
|
496
|
-
const pathArray = propertyPath.split('.');
|
|
497
|
-
const sql = 'UPDATE chemicals SET chemical_meta = jsonb_set(chemical_meta, :path::text[], :value::jsonb), updated_at = :updated_at::timestamp WHERE :collection_name = ANY(chemical_categories) RETURNING *';
|
|
498
|
-
const params = [
|
|
499
|
-
{ name: 'path', value: { stringValue: JSON.stringify(pathArray) }, typeHint: 'JSON' },
|
|
500
|
-
{ name: 'value', value: { stringValue: JSON.stringify(newValue) }, typeHint: 'JSON' },
|
|
501
|
-
{ name: 'updated_at', value: { stringValue: this._serializeDate(new Date()) } },
|
|
502
|
-
{ name: 'collection_name', value: { stringValue: collectionName } }
|
|
503
|
-
];
|
|
504
|
-
|
|
505
|
-
const result = await this.connection.query(sql, params);
|
|
506
|
-
const updated = result.rows.map(row => this._mapChemicalRow(row));
|
|
507
|
-
return { updatedCount: updated.length, updated };
|
|
508
|
-
} catch (error) {
|
|
509
|
-
logError('pegasus-sdk', 'ChemicalsService', 'updateCollectionProperty', error);
|
|
510
|
-
throw error;
|
|
511
|
-
}
|
|
512
|
-
}
|
|
513
|
-
|
|
514
|
-
async bulkUpdateProperty(filter, propertyPath, newValue) {
|
|
515
|
-
try {
|
|
516
|
-
await this.connection.ensureConnected();
|
|
517
|
-
|
|
518
|
-
let whereClause = '1=1';
|
|
519
|
-
const params = [];
|
|
520
|
-
|
|
521
|
-
if (filter.chemicalIds && filter.chemicalIds.length > 0) {
|
|
522
|
-
const ids = filter.chemicalIds.map((id, i) => `:cid_${i}`).join(',');
|
|
523
|
-
whereClause = `chemical_id = ANY(ARRAY[${ids}]::uuid[])`;
|
|
524
|
-
filter.chemicalIds.forEach((id, i) => {
|
|
525
|
-
params.push({ name: `cid_${i}`, value: { stringValue: id } });
|
|
526
|
-
});
|
|
527
|
-
} else if (filter.sourceIds && filter.sourceIds.length > 0) {
|
|
528
|
-
const ids = filter.sourceIds.map((id, i) => `:sid_${i}`).join(',');
|
|
529
|
-
whereClause = `source_id = ANY(ARRAY[${ids}]::text[])`;
|
|
530
|
-
filter.sourceIds.forEach((id, i) => {
|
|
531
|
-
params.push({ name: `sid_${i}`, value: { stringValue: id } });
|
|
532
|
-
});
|
|
533
|
-
} else if (filter.category) {
|
|
534
|
-
whereClause = ':category = ANY(chemical_categories)';
|
|
535
|
-
params.push({ name: 'category', value: { stringValue: filter.category } });
|
|
536
|
-
}
|
|
537
|
-
|
|
538
|
-
const pathArray = propertyPath.split('.');
|
|
539
|
-
const sql = `UPDATE chemicals SET chemical_meta = jsonb_set(COALESCE(chemical_meta, '{}'), :path::text[], :value::jsonb), updated_at = :updated_at::timestamp WHERE ${whereClause} RETURNING *`;
|
|
540
|
-
|
|
541
|
-
params.push({ name: 'path', value: { stringValue: JSON.stringify(pathArray) }, typeHint: 'JSON' });
|
|
542
|
-
params.push({ name: 'value', value: { stringValue: JSON.stringify(newValue) }, typeHint: 'JSON' });
|
|
543
|
-
params.push({ name: 'updated_at', value: { stringValue: this._serializeDate(new Date()) } });
|
|
544
|
-
|
|
545
|
-
const result = await this.connection.query(sql, params);
|
|
546
|
-
const updated = result.rows.map(row => this._mapChemicalRow(row));
|
|
547
|
-
return { updatedCount: updated.length, updated };
|
|
548
|
-
} catch (error) {
|
|
549
|
-
logError('pegasus-sdk', 'ChemicalsService', 'bulkUpdateProperty', error);
|
|
550
|
-
throw error;
|
|
551
|
-
}
|
|
552
|
-
}
|
|
553
|
-
|
|
554
|
-
async getChemicalById(chemicalId) {
|
|
555
|
-
try {
|
|
556
|
-
await this.connection.ensureConnected();
|
|
557
|
-
|
|
558
|
-
const sql = 'SELECT * FROM chemicals WHERE chemical_id = :id::uuid LIMIT 1';
|
|
559
|
-
const params = [{ name: 'id', value: { stringValue: chemicalId } }];
|
|
560
|
-
const result = await this.connection.query(sql, params);
|
|
561
|
-
return this._mapChemicalRow(result.rows?.[0]) || null;
|
|
562
|
-
} catch (error) {
|
|
563
|
-
logError('pegasus-sdk', 'ChemicalsService', 'getChemicalById', error);
|
|
564
|
-
throw error;
|
|
565
|
-
}
|
|
566
|
-
}
|
|
567
|
-
|
|
568
|
-
async getChemicalBySourceId(sourceId) {
|
|
569
|
-
try {
|
|
570
|
-
await this.connection.ensureConnected();
|
|
571
|
-
|
|
572
|
-
const sql = 'SELECT * FROM chemicals WHERE source_id = :source_id LIMIT 1';
|
|
573
|
-
const params = [{ name: 'source_id', value: { stringValue: sourceId } }];
|
|
574
|
-
const result = await this.connection.query(sql, params);
|
|
575
|
-
return this._mapChemicalRow(result.rows?.[0]) || null;
|
|
576
|
-
} catch (error) {
|
|
577
|
-
logError('pegasus-sdk', 'ChemicalsService', 'getChemicalBySourceId', error);
|
|
578
|
-
throw error;
|
|
579
|
-
}
|
|
580
|
-
}
|
|
581
|
-
|
|
582
|
-
async getChemicalsByCAS(casNumber) {
|
|
583
|
-
try {
|
|
584
|
-
await this.connection.ensureConnected();
|
|
585
|
-
|
|
586
|
-
const sql = "SELECT * FROM chemicals WHERE chemical_identifiers->>'CAS' = :cas OR chemical_identifiers->'CAS' ? :cas";
|
|
587
|
-
const params = [{ name: 'cas', value: { stringValue: casNumber } }];
|
|
588
|
-
const result = await this.connection.query(sql, params);
|
|
589
|
-
return result.rows.map(row => this._mapChemicalRow(row));
|
|
590
|
-
} catch (error) {
|
|
591
|
-
logError('pegasus-sdk', 'ChemicalsService', 'getChemicalsByCAS', error);
|
|
592
|
-
throw error;
|
|
593
|
-
}
|
|
594
|
-
}
|
|
595
|
-
|
|
596
|
-
async getChemicalsByIdentifier(identifierType, identifierValue) {
|
|
597
|
-
try {
|
|
598
|
-
if (!ALLOWED_IDENTIFIER_TYPES.has(identifierType)) {
|
|
599
|
-
throw new Error(`Invalid identifier type: ${identifierType}`);
|
|
600
|
-
}
|
|
601
|
-
|
|
602
|
-
await this.connection.ensureConnected();
|
|
603
|
-
|
|
604
|
-
const sql = `SELECT * FROM chemicals WHERE chemical_identifiers->>'${identifierType}' = :value OR chemical_identifiers->'${identifierType}' ? :value`;
|
|
605
|
-
const params = [{ name: 'value', value: { stringValue: identifierValue } }];
|
|
606
|
-
const result = await this.connection.query(sql, params);
|
|
607
|
-
return result.rows.map(row => this._mapChemicalRow(row));
|
|
608
|
-
} catch (error) {
|
|
609
|
-
logError('pegasus-sdk', 'ChemicalsService', 'getChemicalsByIdentifier', error);
|
|
610
|
-
throw error;
|
|
611
|
-
}
|
|
612
|
-
}
|
|
613
|
-
|
|
614
|
-
async countByCollection(collectionName) {
|
|
615
|
-
try {
|
|
616
|
-
await this.connection.ensureConnected();
|
|
617
|
-
|
|
618
|
-
const sql = 'SELECT count(*)::int AS count FROM chemicals WHERE :collection_name = ANY(chemical_categories)';
|
|
619
|
-
const params = [{ name: 'collection_name', value: { stringValue: collectionName } }];
|
|
620
|
-
const result = await this.connection.query(sql, params);
|
|
621
|
-
return { count: result.rows[0]?.count ?? 0 };
|
|
622
|
-
} catch (error) {
|
|
623
|
-
logError('pegasus-sdk', 'ChemicalsService', 'countByCollection', error);
|
|
624
|
-
throw error;
|
|
625
|
-
}
|
|
626
|
-
}
|
|
627
|
-
|
|
628
|
-
async countByIdentifier(identifierValue) {
|
|
629
|
-
try {
|
|
630
|
-
await this.connection.ensureConnected();
|
|
631
|
-
|
|
632
|
-
const searchPattern = `%${escapeLikePattern(identifierValue)}%`;
|
|
633
|
-
const sql = 'SELECT count(*)::int AS count FROM chemicals WHERE chemical_identifiers::text LIKE :pattern';
|
|
634
|
-
const params = [{ name: 'pattern', value: { stringValue: searchPattern } }];
|
|
635
|
-
const result = await this.connection.query(sql, params);
|
|
636
|
-
return { count: result.rows[0]?.count ?? 0 };
|
|
637
|
-
} catch (error) {
|
|
638
|
-
logError('pegasus-sdk', 'ChemicalsService', 'countByIdentifier', error);
|
|
639
|
-
throw error;
|
|
640
|
-
}
|
|
641
|
-
}
|
|
642
|
-
|
|
643
|
-
async countByCAS(casNumber) {
|
|
644
|
-
try {
|
|
645
|
-
await this.connection.ensureConnected();
|
|
646
|
-
|
|
647
|
-
const sql = "SELECT count(*)::int AS count FROM chemicals WHERE chemical_identifiers->>'CAS' = :cas OR chemical_identifiers->'CAS' ? :cas";
|
|
648
|
-
const params = [{ name: 'cas', value: { stringValue: casNumber } }];
|
|
649
|
-
const result = await this.connection.query(sql, params);
|
|
650
|
-
return { count: result.rows[0]?.count ?? 0 };
|
|
651
|
-
} catch (error) {
|
|
652
|
-
logError('pegasus-sdk', 'ChemicalsService', 'countByCAS', error);
|
|
653
|
-
throw error;
|
|
654
|
-
}
|
|
655
|
-
}
|
|
656
|
-
|
|
657
|
-
async getTotalSynonymCount() {
|
|
658
|
-
try {
|
|
659
|
-
await this.connection.ensureConnected();
|
|
660
|
-
|
|
661
|
-
const sql = 'SELECT sum(array_length(chemical_synonyms, 1))::int AS count FROM chemicals';
|
|
662
|
-
const result = await this.connection.query(sql, []);
|
|
663
|
-
return { count: result.rows[0]?.count || 0 };
|
|
664
|
-
} catch (error) {
|
|
665
|
-
logError('pegasus-sdk', 'ChemicalsService', 'getTotalSynonymCount', error);
|
|
666
|
-
throw error;
|
|
667
|
-
}
|
|
668
|
-
}
|
|
669
|
-
|
|
670
|
-
async getSynonymCount(synonymTerm) {
|
|
671
|
-
try {
|
|
672
|
-
await this.connection.ensureConnected();
|
|
673
|
-
|
|
674
|
-
const sql = 'SELECT count(*)::int AS count FROM chemicals WHERE :term = ANY(chemical_synonyms)';
|
|
675
|
-
const params = [{ name: 'term', value: { stringValue: synonymTerm } }];
|
|
676
|
-
const result = await this.connection.query(sql, params);
|
|
677
|
-
return { count: result.rows[0]?.count ?? 0 };
|
|
678
|
-
} catch (error) {
|
|
679
|
-
logError('pegasus-sdk', 'ChemicalsService', 'getSynonymCount', error);
|
|
680
|
-
throw error;
|
|
681
|
-
}
|
|
682
|
-
}
|
|
683
|
-
|
|
684
|
-
async convertIdentifier(fromIdentifier, toIdentifierType) {
|
|
685
|
-
try {
|
|
686
|
-
await this.connection.ensureConnected();
|
|
687
|
-
|
|
688
|
-
const searchPattern = `%${escapeLikePattern(fromIdentifier)}%`;
|
|
689
|
-
const sql = 'SELECT * FROM chemicals WHERE chemical_identifiers::text LIKE :pattern LIMIT 1';
|
|
690
|
-
const params = [{ name: 'pattern', value: { stringValue: searchPattern } }];
|
|
691
|
-
const result = await this.connection.query(sql, params);
|
|
692
|
-
|
|
693
|
-
if (result.rows.length === 0) {
|
|
694
|
-
return null;
|
|
695
|
-
}
|
|
696
|
-
|
|
697
|
-
const chemical = this._mapChemicalRow(result.rows[0]);
|
|
698
|
-
const identifiers = chemical.chemicalIdentifiers || {};
|
|
699
|
-
const toIdentifier = identifiers[toIdentifierType];
|
|
700
|
-
|
|
701
|
-
return {
|
|
702
|
-
fromIdentifier,
|
|
703
|
-
toIdentifierType,
|
|
704
|
-
toIdentifier,
|
|
705
|
-
chemicalId: chemical.chemicalId,
|
|
706
|
-
chemicalName: chemical.chemicalName
|
|
707
|
-
};
|
|
708
|
-
} catch (error) {
|
|
709
|
-
logError('pegasus-sdk', 'ChemicalsService', 'convertIdentifier', error);
|
|
710
|
-
throw error;
|
|
711
|
-
}
|
|
712
|
-
}
|
|
713
|
-
|
|
714
|
-
async convertIdentifiersBatch(fromIdentifiers, toIdentifierType) {
|
|
715
|
-
try {
|
|
716
|
-
const conversions = await Promise.all(
|
|
717
|
-
fromIdentifiers.map(fromIdentifier =>
|
|
718
|
-
this.convertIdentifier(fromIdentifier, toIdentifierType)
|
|
719
|
-
)
|
|
720
|
-
);
|
|
721
|
-
|
|
722
|
-
return conversions.filter(conversion => conversion !== null);
|
|
723
|
-
} catch (error) {
|
|
724
|
-
logError('pegasus-sdk', 'ChemicalsService', 'convertIdentifiersBatch', error);
|
|
725
|
-
throw error;
|
|
726
|
-
}
|
|
727
|
-
}
|
|
728
|
-
|
|
729
|
-
/**
|
|
730
|
-
* Search for chemicals by name using OpenSearch
|
|
731
|
-
* @param {string} searchTerm - Name to search for
|
|
732
|
-
* @param {number} limit - Maximum number of results (default: 10)
|
|
733
|
-
* @returns {Promise<Object>} Search results
|
|
734
|
-
*/
|
|
735
|
-
async searchByName(searchTerm, limit = 10) {
|
|
736
|
-
if (!searchTerm) {
|
|
737
|
-
return { results: [] };
|
|
738
|
-
}
|
|
739
|
-
|
|
740
|
-
try {
|
|
741
|
-
const result = await this.connection.invokeOpenSearch({
|
|
742
|
-
operation: 'search',
|
|
743
|
-
body: {
|
|
744
|
-
size: limit,
|
|
745
|
-
query: {
|
|
746
|
-
bool: {
|
|
747
|
-
should: [
|
|
748
|
-
{ term: { 'chemical_name.keyword': { value: searchTerm, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
|
|
749
|
-
{ prefix: { 'chemical_name.keyword': { value: searchTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
|
|
750
|
-
{ term: { 'synonyms.keyword': { value: searchTerm, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
|
|
751
|
-
{ prefix: { 'synonyms.keyword': { value: searchTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } },
|
|
752
|
-
{ match: { 'chemical_name': { query: searchTerm, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
753
|
-
{ match: { 'synonyms': { query: searchTerm, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
754
|
-
{ match_phrase_prefix: { 'chemical_name': { query: searchTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
755
|
-
{ match_phrase_prefix: { 'synonyms': { query: searchTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
756
|
-
],
|
|
757
|
-
minimum_should_match: 1
|
|
758
|
-
}
|
|
759
|
-
},
|
|
760
|
-
_source: ['postgres_id', 'chemical_name', 'cas_numbers', 'identifier_values', 'synonyms']
|
|
761
|
-
}
|
|
762
|
-
});
|
|
763
|
-
|
|
764
|
-
const hits = result?.hits?.hits || [];
|
|
765
|
-
const results = hits.map((hit) => ({
|
|
766
|
-
id: hit._source.postgres_id,
|
|
767
|
-
name: hit._source.chemical_name,
|
|
768
|
-
cas: hit._source.cas_numbers || [],
|
|
769
|
-
identifiers: hit._source.identifier_values || [],
|
|
770
|
-
synonyms: hit._source.synonyms || [],
|
|
771
|
-
score: hit._score
|
|
772
|
-
}));
|
|
773
|
-
|
|
774
|
-
return { results };
|
|
775
|
-
} catch (error) {
|
|
776
|
-
logError('pegasus-sdk', 'ChemicalsService', 'searchByName', error);
|
|
777
|
-
throw error;
|
|
778
|
-
}
|
|
779
|
-
}
|
|
780
|
-
|
|
781
|
-
/**
|
|
782
|
-
* Search for chemicals by synonym using OpenSearch
|
|
783
|
-
* @param {string} synonymTerm - Synonym to search for
|
|
784
|
-
* @param {number} limit - Maximum number of results (default: 10)
|
|
785
|
-
* @returns {Promise<Object>} Search results
|
|
786
|
-
*/
|
|
787
|
-
async searchBySynonym(synonymTerm, limit = 10) {
|
|
788
|
-
if (!synonymTerm) {
|
|
789
|
-
return { results: [] };
|
|
790
|
-
}
|
|
791
|
-
|
|
792
|
-
try {
|
|
793
|
-
const result = await this.connection.invokeOpenSearch({
|
|
794
|
-
operation: 'search',
|
|
795
|
-
body: {
|
|
796
|
-
size: limit,
|
|
797
|
-
query: {
|
|
798
|
-
bool: {
|
|
799
|
-
should: [
|
|
800
|
-
{ term: { 'synonyms.keyword': { value: synonymTerm, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
|
|
801
|
-
{ prefix: { 'synonyms.keyword': { value: synonymTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
|
|
802
|
-
{ term: { 'chemical_name.keyword': { value: synonymTerm, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
|
|
803
|
-
{ prefix: { 'chemical_name.keyword': { value: synonymTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } },
|
|
804
|
-
{ match: { 'synonyms': { query: synonymTerm, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
805
|
-
{ match: { 'chemical_name': { query: synonymTerm, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
806
|
-
{ match_phrase_prefix: { 'synonyms': { query: synonymTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
807
|
-
{ match_phrase_prefix: { 'chemical_name': { query: synonymTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
808
|
-
],
|
|
809
|
-
minimum_should_match: 1
|
|
810
|
-
}
|
|
811
|
-
},
|
|
812
|
-
_source: ['postgres_id', 'chemical_name', 'cas_numbers', 'identifier_values', 'synonyms']
|
|
813
|
-
}
|
|
814
|
-
});
|
|
815
|
-
|
|
816
|
-
const hits = result?.hits?.hits || [];
|
|
817
|
-
const results = hits.map((hit) => ({
|
|
818
|
-
id: hit._source.postgres_id,
|
|
819
|
-
name: hit._source.chemical_name,
|
|
820
|
-
cas: hit._source.cas_numbers || [],
|
|
821
|
-
identifiers: hit._source.identifier_values || [],
|
|
822
|
-
synonyms: hit._source.synonyms || [],
|
|
823
|
-
score: hit._score
|
|
824
|
-
}));
|
|
825
|
-
|
|
826
|
-
return { results };
|
|
827
|
-
} catch (error) {
|
|
828
|
-
logError('pegasus-sdk', 'ChemicalsService', 'searchBySynonym', error);
|
|
829
|
-
throw error;
|
|
830
|
-
}
|
|
831
|
-
}
|
|
832
|
-
|
|
833
|
-
async countAll() {
|
|
834
|
-
try {
|
|
835
|
-
await this.connection.ensureConnected();
|
|
836
|
-
const sql = 'SELECT count(*)::int AS count FROM chemicals';
|
|
837
|
-
const result = await this.connection.query(sql, []);
|
|
838
|
-
return { count: result.rows[0]?.count ?? 0 };
|
|
839
|
-
} catch (error) {
|
|
840
|
-
logError('pegasus-sdk', 'ChemicalsService', 'countAll', error);
|
|
841
|
-
throw error;
|
|
842
|
-
}
|
|
843
|
-
}
|
|
844
|
-
|
|
845
|
-
async findChemicalsWithoutDocuments(collectionName, searchTerm, pageSize = 100) {
|
|
846
|
-
try {
|
|
847
|
-
await this.connection.ensureConnected();
|
|
848
|
-
|
|
849
|
-
let whereFragments = [];
|
|
850
|
-
const params = [];
|
|
851
|
-
|
|
852
|
-
if (collectionName) {
|
|
853
|
-
whereFragments.push(':collection_name = ANY(chemical_categories)');
|
|
854
|
-
params.push({ name: 'collection_name', value: { stringValue: collectionName } });
|
|
855
|
-
}
|
|
856
|
-
|
|
857
|
-
if (searchTerm) {
|
|
858
|
-
const searchPattern = `%${escapeLikePattern(searchTerm)}%`;
|
|
859
|
-
whereFragments.push('chemical_name ILIKE :search_term');
|
|
860
|
-
params.push({ name: 'search_term', value: { stringValue: searchPattern } });
|
|
861
|
-
}
|
|
862
|
-
|
|
863
|
-
const whereClause = whereFragments.length > 0 ? 'WHERE ' + whereFragments.join(' AND ') : '';
|
|
864
|
-
params.push({ name: 'page_size', value: { longValue: pageSize } });
|
|
865
|
-
|
|
866
|
-
const sql = `SELECT * FROM chemicals ${whereClause} LIMIT :page_size`;
|
|
867
|
-
const result = await this.connection.query(sql, params);
|
|
868
|
-
return result.rows.map(row => this._mapChemicalRow(row));
|
|
869
|
-
} catch (error) {
|
|
870
|
-
logError('pegasus-sdk', 'ChemicalsService', 'findChemicalsWithoutDocuments', error);
|
|
871
|
-
throw error;
|
|
872
|
-
}
|
|
873
|
-
}
|
|
874
|
-
|
|
875
|
-
async countChemicalsWithoutDocuments(collectionName) {
|
|
876
|
-
try {
|
|
877
|
-
await this.connection.ensureConnected();
|
|
878
|
-
|
|
879
|
-
let sql = 'SELECT count(*)::int AS count FROM chemicals';
|
|
880
|
-
const params = [];
|
|
881
|
-
if (collectionName) {
|
|
882
|
-
sql += ' WHERE :collection_name = ANY(chemical_categories)';
|
|
883
|
-
params.push({ name: 'collection_name', value: { stringValue: collectionName } });
|
|
884
|
-
}
|
|
885
|
-
|
|
886
|
-
const result = await this.connection.query(sql, params);
|
|
887
|
-
return { count: result.rows[0]?.count ?? 0 };
|
|
888
|
-
} catch (error) {
|
|
889
|
-
logError('pegasus-sdk', 'ChemicalsService', 'countChemicalsWithoutDocuments', error);
|
|
890
|
-
throw error;
|
|
891
|
-
}
|
|
892
|
-
}
|
|
893
|
-
|
|
894
|
-
_buildEsHandlers() {
|
|
895
|
-
return {
|
|
896
|
-
index: async (params) => {
|
|
897
|
-
const chemical = params.body;
|
|
898
|
-
const result = await this.createChemical(chemical);
|
|
899
|
-
|
|
900
|
-
return {
|
|
901
|
-
_index: params.index,
|
|
902
|
-
_id: result.chemicalId,
|
|
903
|
-
_version: 1,
|
|
904
|
-
result: 'created',
|
|
905
|
-
_source: result
|
|
906
|
-
};
|
|
907
|
-
},
|
|
908
|
-
|
|
909
|
-
bulk: async (params) => {
|
|
910
|
-
const operations = params.body || params.operations;
|
|
911
|
-
|
|
912
|
-
logInfo('pegasus-sdk', `[ChemicalsService.bulk] Starting bulk operation with ${operations?.length || 0} total operations`);
|
|
913
|
-
logInfo('pegasus-sdk', `[ChemicalsService.bulk] Params index: ${params.index}`);
|
|
914
|
-
logInfo('pegasus-sdk', `[ChemicalsService.bulk] Operations array type: ${Array.isArray(operations) ? 'array' : typeof operations}`);
|
|
915
|
-
|
|
916
|
-
const cdiDocuments = [];
|
|
917
|
-
let cdiOpCount = 0;
|
|
918
|
-
let otherOpCount = 0;
|
|
919
|
-
|
|
920
|
-
for (let i = 0; i < operations.length; i++) {
|
|
921
|
-
const op = operations[i];
|
|
922
|
-
const isIndexOp = !!(op.index || op.create);
|
|
923
|
-
const indexName = op.index?._index || op.create?._index || op.delete?._index || op.update?._index;
|
|
924
|
-
|
|
925
|
-
logInfo('pegasus-sdk', `[ChemicalsService.bulk] Op[${i}]: action=${Object.keys(op)[0] || 'unknown'}, index=${indexName}`);
|
|
926
|
-
|
|
927
|
-
if ((op.index || op.create) &&
|
|
928
|
-
(op.index?._index === 'chemical_data_index' || op.create?._index === 'chemical_data_index')) {
|
|
929
|
-
const doc = operations[i + 1];
|
|
930
|
-
const sourceId = op.index?._id || op.create?._id;
|
|
931
|
-
|
|
932
|
-
logInfo('pegasus-sdk', `[ChemicalsService.bulk] Found CDI entry: sourceId=${sourceId}, hasDoc=${!!doc}`);
|
|
933
|
-
|
|
934
|
-
if (doc && sourceId) {
|
|
935
|
-
const cdiDoc = {
|
|
936
|
-
source_id: sourceId,
|
|
937
|
-
chemical_name: doc.chemical_primary_name || (doc.chemical_names && doc.chemical_names[0]) || null,
|
|
938
|
-
chemical_meta: doc.chemical_meta || {},
|
|
939
|
-
chemical_identifiers: doc.chemical_identifiers || {},
|
|
940
|
-
chemical_synonyms: doc.chemical_synonyms || [],
|
|
941
|
-
chemical_categories: doc.chemical_categories || [],
|
|
942
|
-
created_at: doc.chemical_created_at ? (typeof doc.chemical_created_at === 'string' ? new Date(doc.chemical_created_at) : doc.chemical_created_at) : new Date(),
|
|
943
|
-
updated_at: doc.chemical_updated_at ? (typeof doc.chemical_updated_at === 'string' ? new Date(doc.chemical_updated_at) : doc.chemical_updated_at) : new Date()
|
|
944
|
-
};
|
|
945
|
-
cdiDocuments.push(cdiDoc);
|
|
946
|
-
logInfo('pegasus-sdk', `[ChemicalsService.bulk] Extracted CDI doc: ${JSON.stringify({ source_id: cdiDoc.source_id, chemical_name: cdiDoc.chemical_name })}`);
|
|
947
|
-
i++;
|
|
948
|
-
cdiOpCount++;
|
|
949
|
-
} else {
|
|
950
|
-
logInfo('pegasus-sdk', `[ChemicalsService.bulk] CDI entry incomplete: sourceId=${sourceId}, doc=${!!doc}`);
|
|
951
|
-
}
|
|
952
|
-
} else {
|
|
953
|
-
otherOpCount++;
|
|
954
|
-
}
|
|
955
|
-
}
|
|
956
|
-
|
|
957
|
-
logInfo('pegasus-sdk', `[ChemicalsService.bulk] Scan complete: ${cdiOpCount} CDI docs found, ${otherOpCount} other operations skipped`);
|
|
958
|
-
|
|
959
|
-
if (cdiDocuments.length === 0) {
|
|
960
|
-
logInfo('pegasus-sdk', `[ChemicalsService.bulk] No CDI documents to index, returning empty no-op response`);
|
|
961
|
-
return { took: 0, errors: false, items: [] };
|
|
962
|
-
}
|
|
963
|
-
|
|
964
|
-
logInfo('pegasus-sdk', `[ChemicalsService.bulk] Calling bulkIndexFielded with ${cdiDocuments.length} CDI documents`);
|
|
965
|
-
|
|
966
|
-
try {
|
|
967
|
-
const result = await this.bulkIndexFielded(cdiDocuments);
|
|
968
|
-
logInfo('pegasus-sdk', `[ChemicalsService.bulk] bulkIndexFielded returned: indexed=${result.indexed}, errors=${result.errors.length}`);
|
|
969
|
-
|
|
970
|
-
if (result.errors.length > 0) {
|
|
971
|
-
logError('pegasus-sdk', 'ChemicalsService.bulk', 'Errors during bulk indexing', result.errors);
|
|
972
|
-
}
|
|
973
|
-
|
|
974
|
-
return {
|
|
975
|
-
took: 1,
|
|
976
|
-
errors: result.errors.length > 0,
|
|
977
|
-
items: result.results.map((res, idx) => ({
|
|
978
|
-
index: {
|
|
979
|
-
_index: 'chemical_data_index',
|
|
980
|
-
_id: cdiDocuments[idx].source_id,
|
|
981
|
-
status: res.success ? 200 : 400,
|
|
982
|
-
result: res.success ? 'created' : 'error',
|
|
983
|
-
...(res.success ? {} : { error: { type: 'mapper_parsing_exception', reason: res.error } })
|
|
984
|
-
}
|
|
985
|
-
}))
|
|
986
|
-
};
|
|
987
|
-
} catch (error) {
|
|
988
|
-
logError('pegasus-sdk', 'ChemicalsService.bulk', 'Fatal error during bulk indexing', error);
|
|
989
|
-
throw error;
|
|
990
|
-
}
|
|
991
|
-
},
|
|
992
|
-
|
|
993
|
-
get: async (params) => {
|
|
994
|
-
const result = await this.getChemicalBySourceId(params.id);
|
|
995
|
-
|
|
996
|
-
if (!result) {
|
|
997
|
-
return {
|
|
998
|
-
_index: params.index,
|
|
999
|
-
_id: params.id,
|
|
1000
|
-
found: false
|
|
1001
|
-
};
|
|
1002
|
-
}
|
|
1003
|
-
|
|
1004
|
-
return {
|
|
1005
|
-
_index: params.index,
|
|
1006
|
-
_id: params.id,
|
|
1007
|
-
_version: 1,
|
|
1008
|
-
found: true,
|
|
1009
|
-
_source: result
|
|
1010
|
-
};
|
|
1011
|
-
},
|
|
1012
|
-
|
|
1013
|
-
update: async (params) => {
|
|
1014
|
-
const result = await this.updateChemical(params.id, params.body);
|
|
1015
|
-
|
|
1016
|
-
return {
|
|
1017
|
-
_index: params.index,
|
|
1018
|
-
_id: params.id,
|
|
1019
|
-
_version: 2,
|
|
1020
|
-
result: result ? 'updated' : 'noop',
|
|
1021
|
-
_source: result
|
|
1022
|
-
};
|
|
1023
|
-
},
|
|
1024
|
-
|
|
1025
|
-
delete: async (params) => {
|
|
1026
|
-
if (params.index === 'synonym_lookup_index') {
|
|
1027
|
-
return { _index: params.index, _id: params.id, result: 'not_found' };
|
|
1028
|
-
}
|
|
1029
|
-
const result = await this.deleteBySourceId(params.id);
|
|
1030
|
-
|
|
1031
|
-
return {
|
|
1032
|
-
_index: params.index,
|
|
1033
|
-
_id: params.id,
|
|
1034
|
-
result: result ? 'deleted' : 'not_found'
|
|
1035
|
-
};
|
|
1036
|
-
},
|
|
1037
|
-
|
|
1038
|
-
deleteByQuery: async (params) => {
|
|
1039
|
-
const sourceId = params.body?.query?.term?.chemical_set_identifier
|
|
1040
|
-
|| params.body?.query?.term?.source_id;
|
|
1041
|
-
if (!sourceId) {
|
|
1042
|
-
return { deleted: 0, failures: [] };
|
|
1043
|
-
}
|
|
1044
|
-
const result = await this.deleteBySourceId(sourceId);
|
|
1045
|
-
return {
|
|
1046
|
-
deleted: result ? 1 : 0,
|
|
1047
|
-
failures: []
|
|
1048
|
-
};
|
|
1049
|
-
},
|
|
1050
|
-
|
|
1051
|
-
search: async (params) => {
|
|
1052
|
-
let searchTerm = '';
|
|
1053
|
-
let limit = params.body?.size || 10;
|
|
1054
|
-
|
|
1055
|
-
if (params.index === 'synonym_lookup_index') {
|
|
1056
|
-
const query = params.body?.query;
|
|
1057
|
-
searchTerm = query?.match?.chemical_name ||
|
|
1058
|
-
query?.term?.chemical_name ||
|
|
1059
|
-
query?.query_string?.query || '';
|
|
1060
|
-
const searchResults = await this.searchBySynonym(searchTerm, limit);
|
|
1061
|
-
|
|
1062
|
-
return {
|
|
1063
|
-
took: 1,
|
|
1064
|
-
timed_out: false,
|
|
1065
|
-
_shards: {
|
|
1066
|
-
total: 1,
|
|
1067
|
-
successful: 1,
|
|
1068
|
-
skipped: 0,
|
|
1069
|
-
failed: 0
|
|
1070
|
-
},
|
|
1071
|
-
hits: {
|
|
1072
|
-
total: {
|
|
1073
|
-
value: searchResults.results.length,
|
|
1074
|
-
relation: 'eq'
|
|
1075
|
-
},
|
|
1076
|
-
max_score: searchResults.results[0]?.score || 0,
|
|
1077
|
-
hits: searchResults.results.map(result => ({
|
|
1078
|
-
_index: params.index,
|
|
1079
|
-
_id: result.id,
|
|
1080
|
-
_score: result.score,
|
|
1081
|
-
_source: {
|
|
1082
|
-
postgres_id: result.id,
|
|
1083
|
-
chemical_name: result.name,
|
|
1084
|
-
cas_numbers: result.cas,
|
|
1085
|
-
identifier_values: result.identifiers,
|
|
1086
|
-
synonyms: result.synonyms
|
|
1087
|
-
}
|
|
1088
|
-
}))
|
|
1089
|
-
}
|
|
1090
|
-
};
|
|
1091
|
-
} else {
|
|
1092
|
-
const query = params.body?.query;
|
|
1093
|
-
searchTerm = query?.match?.chemical_name ||
|
|
1094
|
-
query?.term?.chemical_name ||
|
|
1095
|
-
query?.query_string?.query || '';
|
|
1096
|
-
const searchResults = await this.searchByName(searchTerm, limit);
|
|
1097
|
-
|
|
1098
|
-
return {
|
|
1099
|
-
took: 1,
|
|
1100
|
-
timed_out: false,
|
|
1101
|
-
_shards: {
|
|
1102
|
-
total: 1,
|
|
1103
|
-
successful: 1,
|
|
1104
|
-
skipped: 0,
|
|
1105
|
-
failed: 0
|
|
1106
|
-
},
|
|
1107
|
-
hits: {
|
|
1108
|
-
total: {
|
|
1109
|
-
value: searchResults.results.length,
|
|
1110
|
-
relation: 'eq'
|
|
1111
|
-
},
|
|
1112
|
-
max_score: searchResults.results[0]?.score || 0,
|
|
1113
|
-
hits: searchResults.results.map(result => ({
|
|
1114
|
-
_index: params.index,
|
|
1115
|
-
_id: result.id,
|
|
1116
|
-
_score: result.score,
|
|
1117
|
-
_source: {
|
|
1118
|
-
postgres_id: result.id,
|
|
1119
|
-
chemical_name: result.name,
|
|
1120
|
-
cas_numbers: result.cas,
|
|
1121
|
-
identifier_values: result.identifiers,
|
|
1122
|
-
synonyms: result.synonyms
|
|
1123
|
-
}
|
|
1124
|
-
}))
|
|
1125
|
-
}
|
|
1126
|
-
};
|
|
1127
|
-
}
|
|
1128
|
-
},
|
|
1129
|
-
|
|
1130
|
-
count: async (params) => {
|
|
1131
|
-
if (params.index === 'synonym_lookup_index') {
|
|
1132
|
-
return await this.getTotalSynonymCount();
|
|
1133
|
-
}
|
|
1134
|
-
return await this.countAll();
|
|
1135
|
-
}
|
|
1136
|
-
};
|
|
1137
|
-
}
|
|
1138
|
-
|
|
1139
|
-
registerElasticsearchHandlers(elasticsearchService) {
|
|
1140
|
-
const configurablePatterns = this.connection.config.indexRoutes?.chemicals || ['chemicals*'];
|
|
1141
|
-
const legacyPatterns = ['synonym_lookup_index', 'chemical_data_index', 'chemical_converter_index'];
|
|
1142
|
-
const allPatterns = [...new Set([...configurablePatterns, ...legacyPatterns])];
|
|
1143
|
-
const handlers = this._buildEsHandlers();
|
|
1144
|
-
allPatterns.forEach(pattern => {
|
|
1145
|
-
elasticsearchService.registerIndexRoute(pattern, handlers);
|
|
1146
|
-
});
|
|
1147
|
-
}
|
|
1148
|
-
}
|
|
1149
|
-
|
|
1
|
+
const { logError, logInfo } = require('@toxplanet/tphelper/logging');
|
|
2
|
+
const { SQSClient, SendMessageCommand } = require('@aws-sdk/client-sqs');
|
|
3
|
+
|
|
4
|
+
const SEARCH_BOOST_EXACT_PRIMARY = 100;
|
|
5
|
+
const SEARCH_BOOST_PREFIX_PRIMARY = 50;
|
|
6
|
+
const SEARCH_BOOST_EXACT_SECONDARY = 30;
|
|
7
|
+
const SEARCH_BOOST_PREFIX_SECONDARY = 10;
|
|
8
|
+
|
|
9
|
+
const ALLOWED_IDENTIFIER_TYPES = new Set(['CAS', 'SMILES', 'InChI', 'InChIKey', 'PubChem', 'DTXSID', 'EINECS', 'EC']);
|
|
10
|
+
|
|
11
|
+
/**
 * Escape the SQL LIKE metacharacters (%, _ and backslash) in a value so that it
 * matches literally when embedded in a LIKE / ILIKE pattern.
 *
 * @param {*} value - Value to escape; non-strings (e.g. numeric identifiers) are
 *   converted with String().
 * @returns {string} The value with each metacharacter prefixed by a backslash.
 * @throws {TypeError} When value is null or undefined.
 */
function escapeLikePattern(value) {
  if (value === null || value === undefined) {
    throw new TypeError('escapeLikePattern: value must not be null or undefined');
  }
  return String(value).replace(/[%_\\]/g, '\\$&');
}
|
|
14
|
+
|
|
15
|
+
/**
 * Parse the text form of a one-dimensional PostgreSQL array (e.g. `{a,"b,c"}`)
 * into an array of strings.
 *
 * Handles quoted elements, backslash escapes (`\"` and `\\`, the form that
 * PostgreSQL emits), doubled quotes inside a quoted element, and empty-string
 * elements written as `""`. An unquoted NULL is returned as the literal string
 * 'NULL'.
 *
 * @param {string|string[]|null|undefined} str - Array literal; a value that is
 *   already an array is returned as a shallow copy.
 * @returns {string[]} The decoded elements; [] for falsy input or `{}`.
 */
function parsePostgresArray(str) {
  if (Array.isArray(str)) return [...str];
  if (!str || str === '{}') return [];
  const inner = str.slice(1, -1);
  if (!inner) return [];

  const elements = [];
  let current = '';
  let inQuotes = false;
  // Tracks whether the current element was quoted, so that "" is kept as an
  // empty string instead of being mistaken for "no element".
  let sawQuote = false;

  for (let i = 0; i < inner.length; i++) {
    const char = inner[i];
    if (char === '\\' && i + 1 < inner.length) {
      // Backslash escapes the next character, whatever it is.
      current += inner[i + 1];
      i++;
    } else if (char === '"') {
      if (inQuotes && inner[i + 1] === '"') {
        current += '"';
        i++;
      } else {
        inQuotes = !inQuotes;
        sawQuote = true;
      }
    } else if (char === ',' && !inQuotes) {
      elements.push(current);
      current = '';
      sawQuote = false;
    } else {
      current += char;
    }
  }

  if (current || sawQuote) elements.push(current);
  return elements;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Normalise chemical metadata to the `{ key, value, [unit] }` entry format.
 *
 * - Non-array input (including plain objects and null) yields [].
 * - An array whose first entry already has a `key` property is returned as-is.
 * - Otherwise each entry is converted from the old
 *   `{ meta_key, meta_value_text, ... }` shape; entries that are not objects
 *   (e.g. null) are skipped instead of raising a TypeError.
 *
 * @param {*} meta - Metadata in either format.
 * @returns {Object[]} Metadata entries in the new format.
 */
function transformChemicalMeta(meta) {
  if (!meta || typeof meta !== 'object') return [];
  if (!Array.isArray(meta)) return [];

  // If it's already in new format, return as-is
  if (meta.length > 0 && meta[0]?.key !== undefined) {
    return meta;
  }

  // Transform from old format { meta_key, meta_value_text, meta_value_type, ... } to new format { key, value, [unit] }
  return meta
    .filter((item) => item !== null && typeof item === 'object')
    .map((item) => {
      const transformed = {
        key: item.meta_key || item.key,
        value: item.meta_value_text || item.value || []
      };
      if (item.unit) transformed.unit = item.unit;
      return transformed;
    });
}
|
|
61
|
+
|
|
62
|
+
/**
 * Normalise chemical identifiers to the `{ type, value }` entry format.
 *
 * - Non-array input (including plain objects and null) yields [].
 * - An array whose first entry already has a `type` property is returned as-is.
 * - Otherwise each entry is converted from the old
 *   `{ identifier_key, identifier_value, ... }` shape; when identifier_value is
 *   an array only its first element is kept. Entries that are not objects
 *   (e.g. null) are skipped instead of raising a TypeError.
 *
 * @param {*} identifiers - Identifiers in either format.
 * @returns {Object[]} Identifier entries in the new format.
 */
function transformChemicalIdentifiers(identifiers) {
  if (!identifiers || typeof identifiers !== 'object') return [];
  if (!Array.isArray(identifiers)) return [];

  // If it's already in new format, return as-is
  if (identifiers.length > 0 && identifiers[0]?.type !== undefined) {
    return identifiers;
  }

  // Transform from old format { identifier_key, identifier_value, ... } to new format { type, value }
  return identifiers
    .filter((item) => item !== null && typeof item === 'object')
    .map((item) => ({
      type: item.identifier_key || item.type,
      value: Array.isArray(item.identifier_value)
        ? item.identifier_value[0]
        : (item.value || item.identifier_value)
    }));
}
|
|
77
|
+
|
|
78
|
+
class ChemicalsService {
|
|
79
|
+
constructor(connection) {
|
|
80
|
+
this.connection = connection;
|
|
81
|
+
this.sqsClient = null;
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
_parsePostgresArray(str) {
|
|
85
|
+
return parsePostgresArray(str);
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
_toPostgresArray(arr) {
|
|
89
|
+
if (!Array.isArray(arr) || arr.length === 0) return '{}';
|
|
90
|
+
return '{' + arr.map(s => `"${String(s).replace(/\\/g, '\\\\').replace(/"/g, '\\"')}"`).join(',') + '}';
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
_serializeDate(d) {
|
|
94
|
+
return d instanceof Date ? d.toISOString() : (d || new Date().toISOString());
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
_mapChemicalRow(row) {
|
|
98
|
+
if (!row) return null;
|
|
99
|
+
return {
|
|
100
|
+
chemicalId: row.chemical_id,
|
|
101
|
+
sourceId: row.source_id,
|
|
102
|
+
chemicalName: row.chemical_name,
|
|
103
|
+
chemicalMeta: row.chemical_meta ? (Array.isArray(row.chemical_meta) ? row.chemical_meta : JSON.parse(row.chemical_meta)) : null,
|
|
104
|
+
chemicalIdentifiers: row.chemical_identifiers ? (Array.isArray(row.chemical_identifiers) ? row.chemical_identifiers : JSON.parse(row.chemical_identifiers)) : null,
|
|
105
|
+
chemicalSynonyms: this._parsePostgresArray(row.chemical_synonyms),
|
|
106
|
+
chemicalCategories: this._parsePostgresArray(row.chemical_categories),
|
|
107
|
+
createdAt: row.created_at,
|
|
108
|
+
updatedAt: row.updated_at,
|
|
109
|
+
importedAt: row.imported_at
|
|
110
|
+
};
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
/**
 * Post a "SqlWriteFailure" message describing a failed SQL write to the
 * failed-items SQS queue. Best effort: never throws.
 *
 * The queue URL comes from SQS_FAILED_ITEMS_QUEUE, or is derived from the
 * region, `config.awsAccountId` and `config.environment`.
 *
 * @param {Object} failure
 * @param {string} failure.sql - Statement that failed.
 * @param {Array} failure.parameters - Parameters bound to the statement.
 * @param {Error|*} failure.error - Original failure; non-Error values are stringified.
 * @param {number} failure.retryCount - Attempts made so far.
 * @param {Date|string|number} [failure.failedAt] - When the write failed; defaults
 *   to now, also when the value cannot be interpreted as a date.
 * @returns {Promise<boolean>} true when the message was sent, false otherwise.
 */
async sendSqlWriteFailure({ sql, parameters, error, retryCount, failedAt }) {
  try {
    const region = process.env.AWS_REGION || this.connection.region;
    const { awsAccountId, environment } = this.connection.config;
    const defaultQueueUrl = awsAccountId
      ? `https://sqs.${region}.amazonaws.com/${awsAccountId}/cr-pegasus-failed-items-${environment}`
      : null;
    const queueUrl = process.env.SQS_FAILED_ITEMS_QUEUE || defaultQueueUrl;

    if (!queueUrl) {
      logError('pegasus-sdk', 'sendSqlWriteFailure', 'No SQS queue URL available: set SQS_FAILED_ITEMS_QUEUE or provide awsAccountId in config');
      return false;
    }

    logInfo('pegasus-sdk', `[sendSqlWriteFailure] Using queue: ${queueUrl}${process.env.SQS_FAILED_ITEMS_QUEUE ? ' (from env)' : ' (default)'}`);

    if (!this.sqsClient) {
      this.sqsClient = new SQSClient({ region });
    }

    // A string or numeric timestamp used to throw on .toISOString(), which the
    // catch below turned into a silently dropped failure report.
    const rawFailedAt = failedAt || new Date();
    let failedAtDate = rawFailedAt instanceof Date ? rawFailedAt : new Date(rawFailedAt);
    if (Number.isNaN(failedAtDate.getTime())) {
      failedAtDate = new Date();
    }

    const message = {
      MessageType: 'SqlWriteFailure',
      SourceService: this.connection.config.sourceService || 'pegasus-sdk',
      Timestamp: failedAtDate.toISOString(),
      Sql: sql,
      Parameters: parameters,
      // Tolerate non-Error rejections (strings, undefined) instead of throwing here.
      OriginalError: error?.message ?? String(error),
      RetryCount: retryCount
    };

    const command = new SendMessageCommand({
      QueueUrl: queueUrl,
      MessageBody: JSON.stringify(message)
    });

    const response = await this.sqsClient.send(command);
    logInfo('pegasus-sdk', `[sendSqlWriteFailure] SqlWriteFailure posted to SQS: MessageId=${response.MessageId}, RetryCount=${retryCount}`);
    return true;
  } catch (sqsError) {
    logError('pegasus-sdk', 'sendSqlWriteFailure', 'Failed to post SqlWriteFailure to SQS', sqsError);
    return false;
  }
}
|
|
156
|
+
|
|
157
|
+
_buildChemicalUpsertSql(chemical) {
|
|
158
|
+
const upsertSql = [
|
|
159
|
+
'INSERT INTO chemicals (source_id, chemical_name, chemical_meta, chemical_identifiers, chemical_synonyms, chemical_categories, created_at, updated_at)',
|
|
160
|
+
'VALUES (:source_id, :chemical_name, :chemical_meta::jsonb, :chemical_identifiers::jsonb, :chemical_synonyms::text[], :chemical_categories::text[], :created_at::timestamp, :updated_at::timestamp)',
|
|
161
|
+
'ON CONFLICT (source_id) DO UPDATE SET',
|
|
162
|
+
' chemical_name = EXCLUDED.chemical_name,',
|
|
163
|
+
' chemical_meta = EXCLUDED.chemical_meta,',
|
|
164
|
+
' chemical_identifiers = EXCLUDED.chemical_identifiers,',
|
|
165
|
+
' chemical_synonyms = EXCLUDED.chemical_synonyms,',
|
|
166
|
+
' chemical_categories = EXCLUDED.chemical_categories,',
|
|
167
|
+
' updated_at = EXCLUDED.updated_at',
|
|
168
|
+
'RETURNING chemical_id, source_id'
|
|
169
|
+
].join('\n');
|
|
170
|
+
|
|
171
|
+
const transformedMeta = transformChemicalMeta(chemical.chemicalMeta);
|
|
172
|
+
const transformedIdentifiers = transformChemicalIdentifiers(chemical.chemicalIdentifiers);
|
|
173
|
+
|
|
174
|
+
const parameters = [
|
|
175
|
+
{ name: 'source_id', value: { stringValue: chemical.sourceId } },
|
|
176
|
+
{ name: 'chemical_name', value: { stringValue: chemical.chemicalName } },
|
|
177
|
+
{ name: 'chemical_meta', value: { stringValue: JSON.stringify(transformedMeta) }, typeHint: 'JSON' },
|
|
178
|
+
{ name: 'chemical_identifiers', value: { stringValue: JSON.stringify(transformedIdentifiers) }, typeHint: 'JSON' },
|
|
179
|
+
{ name: 'chemical_synonyms', value: { stringValue: this._toPostgresArray(chemical.chemicalSynonyms ?? []) } },
|
|
180
|
+
{ name: 'chemical_categories', value: { stringValue: this._toPostgresArray(chemical.chemicalCategories ?? []) } },
|
|
181
|
+
{ name: 'created_at', value: { stringValue: this._serializeDate(chemical.createdAt) } },
|
|
182
|
+
{ name: 'updated_at', value: { stringValue: this._serializeDate(chemical.updatedAt) } }
|
|
183
|
+
];
|
|
184
|
+
|
|
185
|
+
return { sql: upsertSql, parameters };
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
async _executeChemicalUpsert(chemical) {
|
|
189
|
+
await this.connection.ensureConnected();
|
|
190
|
+
const { sql, parameters } = this._buildChemicalUpsertSql(chemical);
|
|
191
|
+
const queryResult = await this.connection.query(sql, parameters);
|
|
192
|
+
const row = queryResult.rows?.[0];
|
|
193
|
+
if (!row) return null;
|
|
194
|
+
return {
|
|
195
|
+
chemicalId: row.chemical_id,
|
|
196
|
+
sourceId: row.source_id
|
|
197
|
+
};
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
_buildDebugSql(chemical) {
|
|
201
|
+
const esc = (s) => `'${String(s ?? '').replace(/'/g, "''")}'`;
|
|
202
|
+
const escJson = (v) => `'${JSON.stringify(v ?? {}).replace(/'/g, "''")}'`;
|
|
203
|
+
const escArr = (arr) => {
|
|
204
|
+
if (!Array.isArray(arr) || arr.length === 0) return `ARRAY[]::text[]`;
|
|
205
|
+
return `ARRAY[${arr.map(s => esc(s)).join(', ')}]`;
|
|
206
|
+
};
|
|
207
|
+
const escDate = (d) => esc(d instanceof Date ? d.toISOString() : (d ?? new Date().toISOString()));
|
|
208
|
+
|
|
209
|
+
return [
|
|
210
|
+
`INSERT INTO chemicals`,
|
|
211
|
+
` (source_id, chemical_name, chemical_meta, chemical_identifiers, chemical_synonyms, chemical_categories, created_at, updated_at)`,
|
|
212
|
+
`VALUES (`,
|
|
213
|
+
` ${esc(chemical.sourceId)},`,
|
|
214
|
+
` ${esc(chemical.chemicalName)},`,
|
|
215
|
+
` ${escJson(chemical.chemicalMeta)}::jsonb,`,
|
|
216
|
+
` ${escJson(chemical.chemicalIdentifiers)}::jsonb,`,
|
|
217
|
+
` ${escArr(chemical.chemicalSynonyms)},`,
|
|
218
|
+
` ${escArr(chemical.chemicalCategories)},`,
|
|
219
|
+
` ${escDate(chemical.createdAt)},`,
|
|
220
|
+
` ${escDate(chemical.updatedAt)}`,
|
|
221
|
+
`)`,
|
|
222
|
+
`ON CONFLICT (source_id) DO UPDATE SET`,
|
|
223
|
+
` chemical_name = ${esc(chemical.chemicalName)},`,
|
|
224
|
+
` chemical_meta = ${escJson(chemical.chemicalMeta)}::jsonb,`,
|
|
225
|
+
` chemical_identifiers = ${escJson(chemical.chemicalIdentifiers)}::jsonb,`,
|
|
226
|
+
` chemical_synonyms = ${escArr(chemical.chemicalSynonyms)},`,
|
|
227
|
+
` chemical_categories = ${escArr(chemical.chemicalCategories)},`,
|
|
228
|
+
` updated_at = NOW();`
|
|
229
|
+
].join('\n');
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
async bulkIndexFielded(documents) {
|
|
233
|
+
try {
|
|
234
|
+
logInfo('pegasus-sdk', `[bulkIndexFielded] Starting bulk index with ${documents?.length || 0} documents`);
|
|
235
|
+
|
|
236
|
+
if (!documents || documents.length === 0) {
|
|
237
|
+
logInfo('pegasus-sdk', `[bulkIndexFielded] No documents provided, returning empty result`);
|
|
238
|
+
return { indexed: 0, errors: [], results: [] };
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
const results = [];
|
|
242
|
+
const errors = [];
|
|
243
|
+
|
|
244
|
+
logInfo('pegasus-sdk', `[bulkIndexFielded] Database connection established`);
|
|
245
|
+
|
|
246
|
+
for (let i = 0; i < documents.length; i++) {
|
|
247
|
+
const doc = documents[i];
|
|
248
|
+
logInfo('pegasus-sdk', `[bulkIndexFielded] Processing document ${i}: source_id=${doc.source_id}, chemical_name=${doc.chemical_name}`);
|
|
249
|
+
|
|
250
|
+
const parseDate = (dateValue) => {
|
|
251
|
+
if (!dateValue) return new Date();
|
|
252
|
+
if (dateValue instanceof Date) return dateValue;
|
|
253
|
+
if (typeof dateValue === 'string') return new Date(dateValue);
|
|
254
|
+
return new Date();
|
|
255
|
+
};
|
|
256
|
+
|
|
257
|
+
const chemical = {
|
|
258
|
+
sourceId: doc.source_id || doc._id,
|
|
259
|
+
chemicalName: doc.chemical_name || doc.name,
|
|
260
|
+
chemicalMeta: doc.chemical_meta || {},
|
|
261
|
+
chemicalIdentifiers: doc.chemical_identifiers || {},
|
|
262
|
+
chemicalSynonyms: doc.chemical_synonyms || [],
|
|
263
|
+
chemicalCategories: doc.chemical_categories || [],
|
|
264
|
+
createdAt: parseDate(doc.created_at),
|
|
265
|
+
updatedAt: parseDate(doc.updated_at),
|
|
266
|
+
...(doc.imported_at && { importedAt: doc.imported_at }),
|
|
267
|
+
...(doc.chemical_id && { chemicalId: doc.chemical_id })
|
|
268
|
+
};
|
|
269
|
+
|
|
270
|
+
logInfo('pegasus-sdk', `[bulkIndexFielded] Prepared chemical object: sourceId=${chemical.sourceId}, chemicalName=${chemical.chemicalName}`);
|
|
271
|
+
logInfo('pegasus-sdk', `[bulkIndexFielded] DEBUG SQL for document ${i}:\n${this._buildDebugSql(chemical)}`);
|
|
272
|
+
|
|
273
|
+
const attemptUpsert = async () => {
|
|
274
|
+
const result = await this._executeChemicalUpsert(chemical);
|
|
275
|
+
return result ? [result] : [];
|
|
276
|
+
};
|
|
277
|
+
|
|
278
|
+
let lastError = null;
|
|
279
|
+
let retryCount = 0;
|
|
280
|
+
const failedAt = new Date();
|
|
281
|
+
|
|
282
|
+
try {
|
|
283
|
+
const [result] = await attemptUpsert();
|
|
284
|
+
logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} indexed successfully: ${result?.chemicalId || 'no ID returned'}`);
|
|
285
|
+
results.push({ index: i, success: true, result });
|
|
286
|
+
continue;
|
|
287
|
+
} catch (firstErr) {
|
|
288
|
+
lastError = firstErr;
|
|
289
|
+
logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} first attempt failed (${firstErr.message}), retrying once`);
|
|
290
|
+
|
|
291
|
+
try {
|
|
292
|
+
const [result] = await attemptUpsert();
|
|
293
|
+
logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} indexed successfully on retry: ${result?.chemicalId || 'no ID returned'}`);
|
|
294
|
+
results.push({ index: i, success: true, result });
|
|
295
|
+
continue;
|
|
296
|
+
} catch (retryErr) {
|
|
297
|
+
lastError = retryErr;
|
|
298
|
+
retryCount = 1;
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
logError('pegasus-sdk', 'bulkIndexFielded', `Document ${i} failed after ${retryCount} local retries (source_id=${chemical.sourceId})`, lastError);
|
|
303
|
+
|
|
304
|
+
const { sql: failureSql, parameters: failureParams } = this._buildChemicalUpsertSql(chemical);
|
|
305
|
+
const queued = await this.sendSqlWriteFailure({
|
|
306
|
+
sql: failureSql,
|
|
307
|
+
parameters: failureParams,
|
|
308
|
+
error: lastError,
|
|
309
|
+
retryCount,
|
|
310
|
+
failedAt
|
|
311
|
+
});
|
|
312
|
+
|
|
313
|
+
if (queued) {
|
|
314
|
+
logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} (source_id=${chemical.sourceId}) queued for repair via SQS`);
|
|
315
|
+
} else {
|
|
316
|
+
logError('pegasus-sdk', 'bulkIndexFielded', `Document ${i} (source_id=${chemical.sourceId}) failed and could not be queued — data loss risk`, lastError);
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
results.push({ index: i, success: false, error: lastError.message, queued });
|
|
320
|
+
errors.push({ document: doc, error: lastError.message, queued });
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
const successCount = results.filter(r => r.success).length;
|
|
324
|
+
const queuedCount = results.filter(r => !r.success && r.queued).length;
|
|
325
|
+
logInfo('pegasus-sdk', `[bulkIndexFielded] Bulk index complete: ${successCount}/${documents.length} succeeded, ${queuedCount} queued for repair, ${errors.length - queuedCount} unhandled errors`);
|
|
326
|
+
|
|
327
|
+
return { indexed: successCount, errors, results };
|
|
328
|
+
} catch (error) {
|
|
329
|
+
logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexFielded', error);
|
|
330
|
+
throw error;
|
|
331
|
+
}
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
async bulkIndexFulltext(documents) {
|
|
335
|
+
try {
|
|
336
|
+
return { acknowledged: true, count: documents?.length || 0 };
|
|
337
|
+
} catch (error) {
|
|
338
|
+
logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexFulltext', error);
|
|
339
|
+
throw error;
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
async bulkIndexSubstances(substances) {
|
|
344
|
+
try {
|
|
345
|
+
const documents = substances.map(substance => ({
|
|
346
|
+
source_id: substance.substance_id || substance.id,
|
|
347
|
+
chemical_name: substance.name || substance.substance_name,
|
|
348
|
+
chemical_meta: substance.meta || {},
|
|
349
|
+
chemical_identifiers: substance.identifiers || {},
|
|
350
|
+
chemical_synonyms: substance.synonyms || [],
|
|
351
|
+
chemical_categories: substance.categories || substance.substance_types || [],
|
|
352
|
+
created_at: substance.created_at,
|
|
353
|
+
updated_at: substance.updated_at,
|
|
354
|
+
imported_at: substance.imported_at
|
|
355
|
+
}));
|
|
356
|
+
|
|
357
|
+
return await this.bulkIndexFielded(documents);
|
|
358
|
+
} catch (error) {
|
|
359
|
+
logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexSubstances', error);
|
|
360
|
+
throw error;
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
|
|
364
|
+
async createChemical(chemical) {
|
|
365
|
+
try {
|
|
366
|
+
await this.connection.ensureConnected();
|
|
367
|
+
|
|
368
|
+
const transformedMeta = transformChemicalMeta(chemical.chemical_meta);
|
|
369
|
+
const transformedIdentifiers = transformChemicalIdentifiers(chemical.chemical_identifiers);
|
|
370
|
+
|
|
371
|
+
const columns = ['source_id', 'chemical_name', 'chemical_meta', 'chemical_identifiers', 'chemical_synonyms', 'chemical_categories', 'created_at', 'updated_at'];
|
|
372
|
+
const values = [':source_id', ':chemical_name', ':chemical_meta::jsonb', ':chemical_identifiers::jsonb', ':chemical_synonyms::text[]', ':chemical_categories::text[]', ':created_at::timestamp', ':updated_at::timestamp'];
|
|
373
|
+
const params = [
|
|
374
|
+
{ name: 'source_id', value: { stringValue: chemical.source_id } },
|
|
375
|
+
{ name: 'chemical_name', value: { stringValue: chemical.chemical_name } },
|
|
376
|
+
{ name: 'chemical_meta', value: { stringValue: JSON.stringify(transformedMeta) }, typeHint: 'JSON' },
|
|
377
|
+
{ name: 'chemical_identifiers', value: { stringValue: JSON.stringify(transformedIdentifiers) }, typeHint: 'JSON' },
|
|
378
|
+
{ name: 'chemical_synonyms', value: { stringValue: this._toPostgresArray(chemical.chemical_synonyms || []) } },
|
|
379
|
+
{ name: 'chemical_categories', value: { stringValue: this._toPostgresArray(chemical.chemical_categories || []) } },
|
|
380
|
+
{ name: 'created_at', value: { stringValue: this._serializeDate(chemical.created_at || new Date()) } },
|
|
381
|
+
{ name: 'updated_at', value: { stringValue: this._serializeDate(chemical.updated_at || new Date()) } }
|
|
382
|
+
];
|
|
383
|
+
|
|
384
|
+
if (chemical.imported_at) {
|
|
385
|
+
columns.push('imported_at');
|
|
386
|
+
values.push(':imported_at::timestamp');
|
|
387
|
+
params.push({ name: 'imported_at', value: { stringValue: this._serializeDate(chemical.imported_at) } });
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
if (chemical.chemical_id) {
|
|
391
|
+
columns.push('chemical_id');
|
|
392
|
+
values.push(':chemical_id');
|
|
393
|
+
params.push({ name: 'chemical_id', value: { stringValue: chemical.chemical_id } });
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
const sql = `INSERT INTO chemicals (${columns.join(', ')}) VALUES (${values.join(', ')}) RETURNING *`;
|
|
397
|
+
const result = await this.connection.query(sql, params);
|
|
398
|
+
return this._mapChemicalRow(result.rows?.[0]);
|
|
399
|
+
} catch (error) {
|
|
400
|
+
logError('pegasus-sdk', 'ChemicalsService', 'createChemical', error);
|
|
401
|
+
throw error;
|
|
402
|
+
}
|
|
403
|
+
}
|
|
404
|
+
|
|
405
|
+
async updateChemical(chemicalId, updates) {
|
|
406
|
+
try {
|
|
407
|
+
await this.connection.ensureConnected();
|
|
408
|
+
|
|
409
|
+
const setClauses = [];
|
|
410
|
+
const params = [];
|
|
411
|
+
|
|
412
|
+
if (updates.chemical_name) {
|
|
413
|
+
setClauses.push('chemical_name = :chemical_name');
|
|
414
|
+
params.push({ name: 'chemical_name', value: { stringValue: updates.chemical_name } });
|
|
415
|
+
}
|
|
416
|
+
if (updates.chemical_meta) {
|
|
417
|
+
const transformedMeta = transformChemicalMeta(updates.chemical_meta);
|
|
418
|
+
setClauses.push('chemical_meta = :chemical_meta::jsonb');
|
|
419
|
+
params.push({ name: 'chemical_meta', value: { stringValue: JSON.stringify(transformedMeta) }, typeHint: 'JSON' });
|
|
420
|
+
}
|
|
421
|
+
if (updates.chemical_identifiers) {
|
|
422
|
+
const transformedIdentifiers = transformChemicalIdentifiers(updates.chemical_identifiers);
|
|
423
|
+
setClauses.push('chemical_identifiers = :chemical_identifiers::jsonb');
|
|
424
|
+
params.push({ name: 'chemical_identifiers', value: { stringValue: JSON.stringify(transformedIdentifiers) }, typeHint: 'JSON' });
|
|
425
|
+
}
|
|
426
|
+
if (updates.chemical_synonyms) {
|
|
427
|
+
setClauses.push('chemical_synonyms = :chemical_synonyms::text[]');
|
|
428
|
+
params.push({ name: 'chemical_synonyms', value: { stringValue: this._toPostgresArray(updates.chemical_synonyms) } });
|
|
429
|
+
}
|
|
430
|
+
if (updates.chemical_categories) {
|
|
431
|
+
setClauses.push('chemical_categories = :chemical_categories::text[]');
|
|
432
|
+
params.push({ name: 'chemical_categories', value: { stringValue: this._toPostgresArray(updates.chemical_categories) } });
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
setClauses.push('updated_at = :updated_at::timestamp');
|
|
436
|
+
params.push({ name: 'updated_at', value: { stringValue: this._serializeDate(new Date()) } });
|
|
437
|
+
|
|
438
|
+
params.push({ name: 'id', value: { stringValue: chemicalId } });
|
|
439
|
+
|
|
440
|
+
const sql = `UPDATE chemicals SET ${setClauses.join(', ')} WHERE chemical_id = :id::uuid RETURNING *`;
|
|
441
|
+
const result = await this.connection.query(sql, params);
|
|
442
|
+
return this._mapChemicalRow(result.rows?.[0]) || null;
|
|
443
|
+
} catch (error) {
|
|
444
|
+
logError('pegasus-sdk', 'ChemicalsService', 'updateChemical', error);
|
|
445
|
+
throw error;
|
|
446
|
+
}
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
async deleteChemical(chemicalId) {
|
|
450
|
+
try {
|
|
451
|
+
await this.connection.ensureConnected();
|
|
452
|
+
|
|
453
|
+
const sql = 'DELETE FROM chemicals WHERE chemical_id = :id::uuid RETURNING *';
|
|
454
|
+
const params = [{ name: 'id', value: { stringValue: chemicalId } }];
|
|
455
|
+
const result = await this.connection.query(sql, params);
|
|
456
|
+
return this._mapChemicalRow(result.rows?.[0]) || null;
|
|
457
|
+
} catch (error) {
|
|
458
|
+
logError('pegasus-sdk', 'ChemicalsService', 'deleteChemical', error);
|
|
459
|
+
throw error;
|
|
460
|
+
}
|
|
461
|
+
}
|
|
462
|
+
|
|
463
|
+
async deleteBySourceId(sourceId) {
|
|
464
|
+
try {
|
|
465
|
+
await this.connection.ensureConnected();
|
|
466
|
+
|
|
467
|
+
const sql = 'DELETE FROM chemicals WHERE source_id = :source_id RETURNING *';
|
|
468
|
+
const params = [{ name: 'source_id', value: { stringValue: sourceId } }];
|
|
469
|
+
const result = await this.connection.query(sql, params);
|
|
470
|
+
return this._mapChemicalRow(result.rows?.[0]) || null;
|
|
471
|
+
} catch (error) {
|
|
472
|
+
logError('pegasus-sdk', 'ChemicalsService', 'deleteBySourceId', error);
|
|
473
|
+
throw error;
|
|
474
|
+
}
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
async deleteCollection(collectionName) {
|
|
478
|
+
try {
|
|
479
|
+
await this.connection.ensureConnected();
|
|
480
|
+
|
|
481
|
+
const sql = 'DELETE FROM chemicals WHERE :collection_name = ANY(chemical_categories) RETURNING *';
|
|
482
|
+
const params = [{ name: 'collection_name', value: { stringValue: collectionName } }];
|
|
483
|
+
const result = await this.connection.query(sql, params);
|
|
484
|
+
const deleted = result.rows.map(row => this._mapChemicalRow(row));
|
|
485
|
+
return { deletedCount: deleted.length, deleted };
|
|
486
|
+
} catch (error) {
|
|
487
|
+
logError('pegasus-sdk', 'ChemicalsService', 'deleteCollection', error);
|
|
488
|
+
throw error;
|
|
489
|
+
}
|
|
490
|
+
}
|
|
491
|
+
|
|
492
|
+
async updateCollectionProperty(collectionName, propertyPath, newValue) {
|
|
493
|
+
try {
|
|
494
|
+
await this.connection.ensureConnected();
|
|
495
|
+
|
|
496
|
+
const pathArray = propertyPath.split('.');
|
|
497
|
+
const sql = 'UPDATE chemicals SET chemical_meta = jsonb_set(chemical_meta, :path::text[], :value::jsonb), updated_at = :updated_at::timestamp WHERE :collection_name = ANY(chemical_categories) RETURNING *';
|
|
498
|
+
const params = [
|
|
499
|
+
{ name: 'path', value: { stringValue: JSON.stringify(pathArray) }, typeHint: 'JSON' },
|
|
500
|
+
{ name: 'value', value: { stringValue: JSON.stringify(newValue) }, typeHint: 'JSON' },
|
|
501
|
+
{ name: 'updated_at', value: { stringValue: this._serializeDate(new Date()) } },
|
|
502
|
+
{ name: 'collection_name', value: { stringValue: collectionName } }
|
|
503
|
+
];
|
|
504
|
+
|
|
505
|
+
const result = await this.connection.query(sql, params);
|
|
506
|
+
const updated = result.rows.map(row => this._mapChemicalRow(row));
|
|
507
|
+
return { updatedCount: updated.length, updated };
|
|
508
|
+
} catch (error) {
|
|
509
|
+
logError('pegasus-sdk', 'ChemicalsService', 'updateCollectionProperty', error);
|
|
510
|
+
throw error;
|
|
511
|
+
}
|
|
512
|
+
}
|
|
513
|
+
|
|
514
|
+
async bulkUpdateProperty(filter, propertyPath, newValue) {
|
|
515
|
+
try {
|
|
516
|
+
await this.connection.ensureConnected();
|
|
517
|
+
|
|
518
|
+
let whereClause = '1=1';
|
|
519
|
+
const params = [];
|
|
520
|
+
|
|
521
|
+
if (filter.chemicalIds && filter.chemicalIds.length > 0) {
|
|
522
|
+
const ids = filter.chemicalIds.map((id, i) => `:cid_${i}`).join(',');
|
|
523
|
+
whereClause = `chemical_id = ANY(ARRAY[${ids}]::uuid[])`;
|
|
524
|
+
filter.chemicalIds.forEach((id, i) => {
|
|
525
|
+
params.push({ name: `cid_${i}`, value: { stringValue: id } });
|
|
526
|
+
});
|
|
527
|
+
} else if (filter.sourceIds && filter.sourceIds.length > 0) {
|
|
528
|
+
const ids = filter.sourceIds.map((id, i) => `:sid_${i}`).join(',');
|
|
529
|
+
whereClause = `source_id = ANY(ARRAY[${ids}]::text[])`;
|
|
530
|
+
filter.sourceIds.forEach((id, i) => {
|
|
531
|
+
params.push({ name: `sid_${i}`, value: { stringValue: id } });
|
|
532
|
+
});
|
|
533
|
+
} else if (filter.category) {
|
|
534
|
+
whereClause = ':category = ANY(chemical_categories)';
|
|
535
|
+
params.push({ name: 'category', value: { stringValue: filter.category } });
|
|
536
|
+
}
|
|
537
|
+
|
|
538
|
+
const pathArray = propertyPath.split('.');
|
|
539
|
+
const sql = `UPDATE chemicals SET chemical_meta = jsonb_set(COALESCE(chemical_meta, '{}'), :path::text[], :value::jsonb), updated_at = :updated_at::timestamp WHERE ${whereClause} RETURNING *`;
|
|
540
|
+
|
|
541
|
+
params.push({ name: 'path', value: { stringValue: JSON.stringify(pathArray) }, typeHint: 'JSON' });
|
|
542
|
+
params.push({ name: 'value', value: { stringValue: JSON.stringify(newValue) }, typeHint: 'JSON' });
|
|
543
|
+
params.push({ name: 'updated_at', value: { stringValue: this._serializeDate(new Date()) } });
|
|
544
|
+
|
|
545
|
+
const result = await this.connection.query(sql, params);
|
|
546
|
+
const updated = result.rows.map(row => this._mapChemicalRow(row));
|
|
547
|
+
return { updatedCount: updated.length, updated };
|
|
548
|
+
} catch (error) {
|
|
549
|
+
logError('pegasus-sdk', 'ChemicalsService', 'bulkUpdateProperty', error);
|
|
550
|
+
throw error;
|
|
551
|
+
}
|
|
552
|
+
}
|
|
553
|
+
|
|
554
|
+
async getChemicalById(chemicalId) {
|
|
555
|
+
try {
|
|
556
|
+
await this.connection.ensureConnected();
|
|
557
|
+
|
|
558
|
+
const sql = 'SELECT * FROM chemicals WHERE chemical_id = :id::uuid LIMIT 1';
|
|
559
|
+
const params = [{ name: 'id', value: { stringValue: chemicalId } }];
|
|
560
|
+
const result = await this.connection.query(sql, params);
|
|
561
|
+
return this._mapChemicalRow(result.rows?.[0]) || null;
|
|
562
|
+
} catch (error) {
|
|
563
|
+
logError('pegasus-sdk', 'ChemicalsService', 'getChemicalById', error);
|
|
564
|
+
throw error;
|
|
565
|
+
}
|
|
566
|
+
}
|
|
567
|
+
|
|
568
|
+
async getChemicalBySourceId(sourceId) {
|
|
569
|
+
try {
|
|
570
|
+
await this.connection.ensureConnected();
|
|
571
|
+
|
|
572
|
+
const sql = 'SELECT * FROM chemicals WHERE source_id = :source_id LIMIT 1';
|
|
573
|
+
const params = [{ name: 'source_id', value: { stringValue: sourceId } }];
|
|
574
|
+
const result = await this.connection.query(sql, params);
|
|
575
|
+
return this._mapChemicalRow(result.rows?.[0]) || null;
|
|
576
|
+
} catch (error) {
|
|
577
|
+
logError('pegasus-sdk', 'ChemicalsService', 'getChemicalBySourceId', error);
|
|
578
|
+
throw error;
|
|
579
|
+
}
|
|
580
|
+
}
|
|
581
|
+
|
|
582
|
+
async getChemicalsByCAS(casNumber) {
|
|
583
|
+
try {
|
|
584
|
+
await this.connection.ensureConnected();
|
|
585
|
+
|
|
586
|
+
const sql = "SELECT * FROM chemicals WHERE chemical_identifiers->>'CAS' = :cas OR chemical_identifiers->'CAS' ? :cas";
|
|
587
|
+
const params = [{ name: 'cas', value: { stringValue: casNumber } }];
|
|
588
|
+
const result = await this.connection.query(sql, params);
|
|
589
|
+
return result.rows.map(row => this._mapChemicalRow(row));
|
|
590
|
+
} catch (error) {
|
|
591
|
+
logError('pegasus-sdk', 'ChemicalsService', 'getChemicalsByCAS', error);
|
|
592
|
+
throw error;
|
|
593
|
+
}
|
|
594
|
+
}
|
|
595
|
+
|
|
596
|
+
async getChemicalsByIdentifier(identifierType, identifierValue) {
|
|
597
|
+
try {
|
|
598
|
+
if (!ALLOWED_IDENTIFIER_TYPES.has(identifierType)) {
|
|
599
|
+
throw new Error(`Invalid identifier type: ${identifierType}`);
|
|
600
|
+
}
|
|
601
|
+
|
|
602
|
+
await this.connection.ensureConnected();
|
|
603
|
+
|
|
604
|
+
const sql = `SELECT * FROM chemicals WHERE chemical_identifiers->>'${identifierType}' = :value OR chemical_identifiers->'${identifierType}' ? :value`;
|
|
605
|
+
const params = [{ name: 'value', value: { stringValue: identifierValue } }];
|
|
606
|
+
const result = await this.connection.query(sql, params);
|
|
607
|
+
return result.rows.map(row => this._mapChemicalRow(row));
|
|
608
|
+
} catch (error) {
|
|
609
|
+
logError('pegasus-sdk', 'ChemicalsService', 'getChemicalsByIdentifier', error);
|
|
610
|
+
throw error;
|
|
611
|
+
}
|
|
612
|
+
}
|
|
613
|
+
|
|
614
|
+
async countByCollection(collectionName) {
|
|
615
|
+
try {
|
|
616
|
+
await this.connection.ensureConnected();
|
|
617
|
+
|
|
618
|
+
const sql = 'SELECT count(*)::int AS count FROM chemicals WHERE :collection_name = ANY(chemical_categories)';
|
|
619
|
+
const params = [{ name: 'collection_name', value: { stringValue: collectionName } }];
|
|
620
|
+
const result = await this.connection.query(sql, params);
|
|
621
|
+
return { count: result.rows[0]?.count ?? 0 };
|
|
622
|
+
} catch (error) {
|
|
623
|
+
logError('pegasus-sdk', 'ChemicalsService', 'countByCollection', error);
|
|
624
|
+
throw error;
|
|
625
|
+
}
|
|
626
|
+
}
|
|
627
|
+
|
|
628
|
+
async countByIdentifier(identifierValue) {
|
|
629
|
+
try {
|
|
630
|
+
await this.connection.ensureConnected();
|
|
631
|
+
|
|
632
|
+
const searchPattern = `%${escapeLikePattern(identifierValue)}%`;
|
|
633
|
+
const sql = 'SELECT count(*)::int AS count FROM chemicals WHERE chemical_identifiers::text LIKE :pattern';
|
|
634
|
+
const params = [{ name: 'pattern', value: { stringValue: searchPattern } }];
|
|
635
|
+
const result = await this.connection.query(sql, params);
|
|
636
|
+
return { count: result.rows[0]?.count ?? 0 };
|
|
637
|
+
} catch (error) {
|
|
638
|
+
logError('pegasus-sdk', 'ChemicalsService', 'countByIdentifier', error);
|
|
639
|
+
throw error;
|
|
640
|
+
}
|
|
641
|
+
}
|
|
642
|
+
|
|
643
|
+
async countByCAS(casNumber) {
|
|
644
|
+
try {
|
|
645
|
+
await this.connection.ensureConnected();
|
|
646
|
+
|
|
647
|
+
const sql = "SELECT count(*)::int AS count FROM chemicals WHERE chemical_identifiers->>'CAS' = :cas OR chemical_identifiers->'CAS' ? :cas";
|
|
648
|
+
const params = [{ name: 'cas', value: { stringValue: casNumber } }];
|
|
649
|
+
const result = await this.connection.query(sql, params);
|
|
650
|
+
return { count: result.rows[0]?.count ?? 0 };
|
|
651
|
+
} catch (error) {
|
|
652
|
+
logError('pegasus-sdk', 'ChemicalsService', 'countByCAS', error);
|
|
653
|
+
throw error;
|
|
654
|
+
}
|
|
655
|
+
}
|
|
656
|
+
|
|
657
|
+
async getTotalSynonymCount() {
|
|
658
|
+
try {
|
|
659
|
+
await this.connection.ensureConnected();
|
|
660
|
+
|
|
661
|
+
const sql = 'SELECT sum(array_length(chemical_synonyms, 1))::int AS count FROM chemicals';
|
|
662
|
+
const result = await this.connection.query(sql, []);
|
|
663
|
+
return { count: result.rows[0]?.count || 0 };
|
|
664
|
+
} catch (error) {
|
|
665
|
+
logError('pegasus-sdk', 'ChemicalsService', 'getTotalSynonymCount', error);
|
|
666
|
+
throw error;
|
|
667
|
+
}
|
|
668
|
+
}
|
|
669
|
+
|
|
670
|
+
async getSynonymCount(synonymTerm) {
|
|
671
|
+
try {
|
|
672
|
+
await this.connection.ensureConnected();
|
|
673
|
+
|
|
674
|
+
const sql = 'SELECT count(*)::int AS count FROM chemicals WHERE :term = ANY(chemical_synonyms)';
|
|
675
|
+
const params = [{ name: 'term', value: { stringValue: synonymTerm } }];
|
|
676
|
+
const result = await this.connection.query(sql, params);
|
|
677
|
+
return { count: result.rows[0]?.count ?? 0 };
|
|
678
|
+
} catch (error) {
|
|
679
|
+
logError('pegasus-sdk', 'ChemicalsService', 'getSynonymCount', error);
|
|
680
|
+
throw error;
|
|
681
|
+
}
|
|
682
|
+
}
|
|
683
|
+
|
|
684
|
+
async convertIdentifier(fromIdentifier, toIdentifierType) {
|
|
685
|
+
try {
|
|
686
|
+
await this.connection.ensureConnected();
|
|
687
|
+
|
|
688
|
+
const searchPattern = `%${escapeLikePattern(fromIdentifier)}%`;
|
|
689
|
+
const sql = 'SELECT * FROM chemicals WHERE chemical_identifiers::text LIKE :pattern LIMIT 1';
|
|
690
|
+
const params = [{ name: 'pattern', value: { stringValue: searchPattern } }];
|
|
691
|
+
const result = await this.connection.query(sql, params);
|
|
692
|
+
|
|
693
|
+
if (result.rows.length === 0) {
|
|
694
|
+
return null;
|
|
695
|
+
}
|
|
696
|
+
|
|
697
|
+
const chemical = this._mapChemicalRow(result.rows[0]);
|
|
698
|
+
const identifiers = chemical.chemicalIdentifiers || {};
|
|
699
|
+
const toIdentifier = identifiers[toIdentifierType];
|
|
700
|
+
|
|
701
|
+
return {
|
|
702
|
+
fromIdentifier,
|
|
703
|
+
toIdentifierType,
|
|
704
|
+
toIdentifier,
|
|
705
|
+
chemicalId: chemical.chemicalId,
|
|
706
|
+
chemicalName: chemical.chemicalName
|
|
707
|
+
};
|
|
708
|
+
} catch (error) {
|
|
709
|
+
logError('pegasus-sdk', 'ChemicalsService', 'convertIdentifier', error);
|
|
710
|
+
throw error;
|
|
711
|
+
}
|
|
712
|
+
}
|
|
713
|
+
|
|
714
|
+
async convertIdentifiersBatch(fromIdentifiers, toIdentifierType) {
|
|
715
|
+
try {
|
|
716
|
+
const conversions = await Promise.all(
|
|
717
|
+
fromIdentifiers.map(fromIdentifier =>
|
|
718
|
+
this.convertIdentifier(fromIdentifier, toIdentifierType)
|
|
719
|
+
)
|
|
720
|
+
);
|
|
721
|
+
|
|
722
|
+
return conversions.filter(conversion => conversion !== null);
|
|
723
|
+
} catch (error) {
|
|
724
|
+
logError('pegasus-sdk', 'ChemicalsService', 'convertIdentifiersBatch', error);
|
|
725
|
+
throw error;
|
|
726
|
+
}
|
|
727
|
+
}
|
|
728
|
+
|
|
729
|
+
/**
|
|
730
|
+
* Search for chemicals by name using OpenSearch
|
|
731
|
+
* @param {string} searchTerm - Name to search for
|
|
732
|
+
* @param {number} limit - Maximum number of results (default: 10)
|
|
733
|
+
* @returns {Promise<Object>} Search results
|
|
734
|
+
*/
|
|
735
|
+
async searchByName(searchTerm, limit = 10) {
|
|
736
|
+
if (!searchTerm) {
|
|
737
|
+
return { results: [] };
|
|
738
|
+
}
|
|
739
|
+
|
|
740
|
+
try {
|
|
741
|
+
const result = await this.connection.invokeOpenSearch({
|
|
742
|
+
operation: 'search',
|
|
743
|
+
body: {
|
|
744
|
+
size: limit,
|
|
745
|
+
query: {
|
|
746
|
+
bool: {
|
|
747
|
+
should: [
|
|
748
|
+
{ term: { 'chemical_name.keyword': { value: searchTerm, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
|
|
749
|
+
{ prefix: { 'chemical_name.keyword': { value: searchTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
|
|
750
|
+
{ term: { 'synonyms.keyword': { value: searchTerm, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
|
|
751
|
+
{ prefix: { 'synonyms.keyword': { value: searchTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } },
|
|
752
|
+
{ match: { 'chemical_name': { query: searchTerm, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
753
|
+
{ match: { 'synonyms': { query: searchTerm, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
754
|
+
{ match_phrase_prefix: { 'chemical_name': { query: searchTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
755
|
+
{ match_phrase_prefix: { 'synonyms': { query: searchTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
756
|
+
],
|
|
757
|
+
minimum_should_match: 1
|
|
758
|
+
}
|
|
759
|
+
},
|
|
760
|
+
_source: ['postgres_id', 'chemical_name', 'cas_numbers', 'identifier_values', 'synonyms']
|
|
761
|
+
}
|
|
762
|
+
});
|
|
763
|
+
|
|
764
|
+
const hits = result?.hits?.hits || [];
|
|
765
|
+
const results = hits.map((hit) => ({
|
|
766
|
+
id: hit._source.postgres_id,
|
|
767
|
+
name: hit._source.chemical_name,
|
|
768
|
+
cas: hit._source.cas_numbers || [],
|
|
769
|
+
identifiers: hit._source.identifier_values || [],
|
|
770
|
+
synonyms: hit._source.synonyms || [],
|
|
771
|
+
score: hit._score
|
|
772
|
+
}));
|
|
773
|
+
|
|
774
|
+
return { results };
|
|
775
|
+
} catch (error) {
|
|
776
|
+
logError('pegasus-sdk', 'ChemicalsService', 'searchByName', error);
|
|
777
|
+
throw error;
|
|
778
|
+
}
|
|
779
|
+
}
|
|
780
|
+
|
|
781
|
+
/**
|
|
782
|
+
* Search for chemicals by synonym using OpenSearch
|
|
783
|
+
* @param {string} synonymTerm - Synonym to search for
|
|
784
|
+
* @param {number} limit - Maximum number of results (default: 10)
|
|
785
|
+
* @returns {Promise<Object>} Search results
|
|
786
|
+
*/
|
|
787
|
+
async searchBySynonym(synonymTerm, limit = 10) {
|
|
788
|
+
if (!synonymTerm) {
|
|
789
|
+
return { results: [] };
|
|
790
|
+
}
|
|
791
|
+
|
|
792
|
+
try {
|
|
793
|
+
const result = await this.connection.invokeOpenSearch({
|
|
794
|
+
operation: 'search',
|
|
795
|
+
body: {
|
|
796
|
+
size: limit,
|
|
797
|
+
query: {
|
|
798
|
+
bool: {
|
|
799
|
+
should: [
|
|
800
|
+
{ term: { 'synonyms.keyword': { value: synonymTerm, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
|
|
801
|
+
{ prefix: { 'synonyms.keyword': { value: synonymTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
|
|
802
|
+
{ term: { 'chemical_name.keyword': { value: synonymTerm, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
|
|
803
|
+
{ prefix: { 'chemical_name.keyword': { value: synonymTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } },
|
|
804
|
+
{ match: { 'synonyms': { query: synonymTerm, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
|
|
805
|
+
{ match: { 'chemical_name': { query: synonymTerm, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
|
|
806
|
+
{ match_phrase_prefix: { 'synonyms': { query: synonymTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
|
|
807
|
+
{ match_phrase_prefix: { 'chemical_name': { query: synonymTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
|
|
808
|
+
],
|
|
809
|
+
minimum_should_match: 1
|
|
810
|
+
}
|
|
811
|
+
},
|
|
812
|
+
_source: ['postgres_id', 'chemical_name', 'cas_numbers', 'identifier_values', 'synonyms']
|
|
813
|
+
}
|
|
814
|
+
});
|
|
815
|
+
|
|
816
|
+
const hits = result?.hits?.hits || [];
|
|
817
|
+
const results = hits.map((hit) => ({
|
|
818
|
+
id: hit._source.postgres_id,
|
|
819
|
+
name: hit._source.chemical_name,
|
|
820
|
+
cas: hit._source.cas_numbers || [],
|
|
821
|
+
identifiers: hit._source.identifier_values || [],
|
|
822
|
+
synonyms: hit._source.synonyms || [],
|
|
823
|
+
score: hit._score
|
|
824
|
+
}));
|
|
825
|
+
|
|
826
|
+
return { results };
|
|
827
|
+
} catch (error) {
|
|
828
|
+
logError('pegasus-sdk', 'ChemicalsService', 'searchBySynonym', error);
|
|
829
|
+
throw error;
|
|
830
|
+
}
|
|
831
|
+
}
|
|
832
|
+
|
|
833
|
+
async countAll() {
|
|
834
|
+
try {
|
|
835
|
+
await this.connection.ensureConnected();
|
|
836
|
+
const sql = 'SELECT count(*)::int AS count FROM chemicals';
|
|
837
|
+
const result = await this.connection.query(sql, []);
|
|
838
|
+
return { count: result.rows[0]?.count ?? 0 };
|
|
839
|
+
} catch (error) {
|
|
840
|
+
logError('pegasus-sdk', 'ChemicalsService', 'countAll', error);
|
|
841
|
+
throw error;
|
|
842
|
+
}
|
|
843
|
+
}
|
|
844
|
+
|
|
845
|
+
async findChemicalsWithoutDocuments(collectionName, searchTerm, pageSize = 100) {
|
|
846
|
+
try {
|
|
847
|
+
await this.connection.ensureConnected();
|
|
848
|
+
|
|
849
|
+
let whereFragments = [];
|
|
850
|
+
const params = [];
|
|
851
|
+
|
|
852
|
+
if (collectionName) {
|
|
853
|
+
whereFragments.push(':collection_name = ANY(chemical_categories)');
|
|
854
|
+
params.push({ name: 'collection_name', value: { stringValue: collectionName } });
|
|
855
|
+
}
|
|
856
|
+
|
|
857
|
+
if (searchTerm) {
|
|
858
|
+
const searchPattern = `%${escapeLikePattern(searchTerm)}%`;
|
|
859
|
+
whereFragments.push('chemical_name ILIKE :search_term');
|
|
860
|
+
params.push({ name: 'search_term', value: { stringValue: searchPattern } });
|
|
861
|
+
}
|
|
862
|
+
|
|
863
|
+
const whereClause = whereFragments.length > 0 ? 'WHERE ' + whereFragments.join(' AND ') : '';
|
|
864
|
+
params.push({ name: 'page_size', value: { longValue: pageSize } });
|
|
865
|
+
|
|
866
|
+
const sql = `SELECT * FROM chemicals ${whereClause} LIMIT :page_size`;
|
|
867
|
+
const result = await this.connection.query(sql, params);
|
|
868
|
+
return result.rows.map(row => this._mapChemicalRow(row));
|
|
869
|
+
} catch (error) {
|
|
870
|
+
logError('pegasus-sdk', 'ChemicalsService', 'findChemicalsWithoutDocuments', error);
|
|
871
|
+
throw error;
|
|
872
|
+
}
|
|
873
|
+
}
|
|
874
|
+
|
|
875
|
+
async countChemicalsWithoutDocuments(collectionName) {
|
|
876
|
+
try {
|
|
877
|
+
await this.connection.ensureConnected();
|
|
878
|
+
|
|
879
|
+
let sql = 'SELECT count(*)::int AS count FROM chemicals';
|
|
880
|
+
const params = [];
|
|
881
|
+
if (collectionName) {
|
|
882
|
+
sql += ' WHERE :collection_name = ANY(chemical_categories)';
|
|
883
|
+
params.push({ name: 'collection_name', value: { stringValue: collectionName } });
|
|
884
|
+
}
|
|
885
|
+
|
|
886
|
+
const result = await this.connection.query(sql, params);
|
|
887
|
+
return { count: result.rows[0]?.count ?? 0 };
|
|
888
|
+
} catch (error) {
|
|
889
|
+
logError('pegasus-sdk', 'ChemicalsService', 'countChemicalsWithoutDocuments', error);
|
|
890
|
+
throw error;
|
|
891
|
+
}
|
|
892
|
+
}
|
|
893
|
+
|
|
894
|
+
_buildEsHandlers() {
|
|
895
|
+
return {
|
|
896
|
+
index: async (params) => {
|
|
897
|
+
const chemical = params.body;
|
|
898
|
+
const result = await this.createChemical(chemical);
|
|
899
|
+
|
|
900
|
+
return {
|
|
901
|
+
_index: params.index,
|
|
902
|
+
_id: result.chemicalId,
|
|
903
|
+
_version: 1,
|
|
904
|
+
result: 'created',
|
|
905
|
+
_source: result
|
|
906
|
+
};
|
|
907
|
+
},
|
|
908
|
+
|
|
909
|
+
bulk: async (params) => {
|
|
910
|
+
const operations = params.body || params.operations;
|
|
911
|
+
|
|
912
|
+
logInfo('pegasus-sdk', `[ChemicalsService.bulk] Starting bulk operation with ${operations?.length || 0} total operations`);
|
|
913
|
+
logInfo('pegasus-sdk', `[ChemicalsService.bulk] Params index: ${params.index}`);
|
|
914
|
+
logInfo('pegasus-sdk', `[ChemicalsService.bulk] Operations array type: ${Array.isArray(operations) ? 'array' : typeof operations}`);
|
|
915
|
+
|
|
916
|
+
const cdiDocuments = [];
|
|
917
|
+
let cdiOpCount = 0;
|
|
918
|
+
let otherOpCount = 0;
|
|
919
|
+
|
|
920
|
+
for (let i = 0; i < operations.length; i++) {
|
|
921
|
+
const op = operations[i];
|
|
922
|
+
const isIndexOp = !!(op.index || op.create);
|
|
923
|
+
const indexName = op.index?._index || op.create?._index || op.delete?._index || op.update?._index;
|
|
924
|
+
|
|
925
|
+
logInfo('pegasus-sdk', `[ChemicalsService.bulk] Op[${i}]: action=${Object.keys(op)[0] || 'unknown'}, index=${indexName}`);
|
|
926
|
+
|
|
927
|
+
if ((op.index || op.create) &&
|
|
928
|
+
(op.index?._index === 'chemical_data_index' || op.create?._index === 'chemical_data_index')) {
|
|
929
|
+
const doc = operations[i + 1];
|
|
930
|
+
const sourceId = op.index?._id || op.create?._id;
|
|
931
|
+
|
|
932
|
+
logInfo('pegasus-sdk', `[ChemicalsService.bulk] Found CDI entry: sourceId=${sourceId}, hasDoc=${!!doc}`);
|
|
933
|
+
|
|
934
|
+
if (doc && sourceId) {
|
|
935
|
+
const cdiDoc = {
|
|
936
|
+
source_id: sourceId,
|
|
937
|
+
chemical_name: doc.chemical_primary_name || (doc.chemical_names && doc.chemical_names[0]) || null,
|
|
938
|
+
chemical_meta: doc.chemical_meta || {},
|
|
939
|
+
chemical_identifiers: doc.chemical_identifiers || {},
|
|
940
|
+
chemical_synonyms: doc.chemical_synonyms || [],
|
|
941
|
+
chemical_categories: doc.chemical_categories || [],
|
|
942
|
+
created_at: doc.chemical_created_at ? (typeof doc.chemical_created_at === 'string' ? new Date(doc.chemical_created_at) : doc.chemical_created_at) : new Date(),
|
|
943
|
+
updated_at: doc.chemical_updated_at ? (typeof doc.chemical_updated_at === 'string' ? new Date(doc.chemical_updated_at) : doc.chemical_updated_at) : new Date()
|
|
944
|
+
};
|
|
945
|
+
cdiDocuments.push(cdiDoc);
|
|
946
|
+
logInfo('pegasus-sdk', `[ChemicalsService.bulk] Extracted CDI doc: ${JSON.stringify({ source_id: cdiDoc.source_id, chemical_name: cdiDoc.chemical_name })}`);
|
|
947
|
+
i++;
|
|
948
|
+
cdiOpCount++;
|
|
949
|
+
} else {
|
|
950
|
+
logInfo('pegasus-sdk', `[ChemicalsService.bulk] CDI entry incomplete: sourceId=${sourceId}, doc=${!!doc}`);
|
|
951
|
+
}
|
|
952
|
+
} else {
|
|
953
|
+
otherOpCount++;
|
|
954
|
+
}
|
|
955
|
+
}
|
|
956
|
+
|
|
957
|
+
logInfo('pegasus-sdk', `[ChemicalsService.bulk] Scan complete: ${cdiOpCount} CDI docs found, ${otherOpCount} other operations skipped`);
|
|
958
|
+
|
|
959
|
+
if (cdiDocuments.length === 0) {
|
|
960
|
+
logInfo('pegasus-sdk', `[ChemicalsService.bulk] No CDI documents to index, returning empty no-op response`);
|
|
961
|
+
return { took: 0, errors: false, items: [] };
|
|
962
|
+
}
|
|
963
|
+
|
|
964
|
+
logInfo('pegasus-sdk', `[ChemicalsService.bulk] Calling bulkIndexFielded with ${cdiDocuments.length} CDI documents`);
|
|
965
|
+
|
|
966
|
+
try {
|
|
967
|
+
const result = await this.bulkIndexFielded(cdiDocuments);
|
|
968
|
+
logInfo('pegasus-sdk', `[ChemicalsService.bulk] bulkIndexFielded returned: indexed=${result.indexed}, errors=${result.errors.length}`);
|
|
969
|
+
|
|
970
|
+
if (result.errors.length > 0) {
|
|
971
|
+
logError('pegasus-sdk', 'ChemicalsService.bulk', 'Errors during bulk indexing', result.errors);
|
|
972
|
+
}
|
|
973
|
+
|
|
974
|
+
return {
|
|
975
|
+
took: 1,
|
|
976
|
+
errors: result.errors.length > 0,
|
|
977
|
+
items: result.results.map((res, idx) => ({
|
|
978
|
+
index: {
|
|
979
|
+
_index: 'chemical_data_index',
|
|
980
|
+
_id: cdiDocuments[idx].source_id,
|
|
981
|
+
status: res.success ? 200 : 400,
|
|
982
|
+
result: res.success ? 'created' : 'error',
|
|
983
|
+
...(res.success ? {} : { error: { type: 'mapper_parsing_exception', reason: res.error } })
|
|
984
|
+
}
|
|
985
|
+
}))
|
|
986
|
+
};
|
|
987
|
+
} catch (error) {
|
|
988
|
+
logError('pegasus-sdk', 'ChemicalsService.bulk', 'Fatal error during bulk indexing', error);
|
|
989
|
+
throw error;
|
|
990
|
+
}
|
|
991
|
+
},
|
|
992
|
+
|
|
993
|
+
get: async (params) => {
|
|
994
|
+
const result = await this.getChemicalBySourceId(params.id);
|
|
995
|
+
|
|
996
|
+
if (!result) {
|
|
997
|
+
return {
|
|
998
|
+
_index: params.index,
|
|
999
|
+
_id: params.id,
|
|
1000
|
+
found: false
|
|
1001
|
+
};
|
|
1002
|
+
}
|
|
1003
|
+
|
|
1004
|
+
return {
|
|
1005
|
+
_index: params.index,
|
|
1006
|
+
_id: params.id,
|
|
1007
|
+
_version: 1,
|
|
1008
|
+
found: true,
|
|
1009
|
+
_source: result
|
|
1010
|
+
};
|
|
1011
|
+
},
|
|
1012
|
+
|
|
1013
|
+
update: async (params) => {
|
|
1014
|
+
const result = await this.updateChemical(params.id, params.body);
|
|
1015
|
+
|
|
1016
|
+
return {
|
|
1017
|
+
_index: params.index,
|
|
1018
|
+
_id: params.id,
|
|
1019
|
+
_version: 2,
|
|
1020
|
+
result: result ? 'updated' : 'noop',
|
|
1021
|
+
_source: result
|
|
1022
|
+
};
|
|
1023
|
+
},
|
|
1024
|
+
|
|
1025
|
+
delete: async (params) => {
|
|
1026
|
+
if (params.index === 'synonym_lookup_index') {
|
|
1027
|
+
return { _index: params.index, _id: params.id, result: 'not_found' };
|
|
1028
|
+
}
|
|
1029
|
+
const result = await this.deleteBySourceId(params.id);
|
|
1030
|
+
|
|
1031
|
+
return {
|
|
1032
|
+
_index: params.index,
|
|
1033
|
+
_id: params.id,
|
|
1034
|
+
result: result ? 'deleted' : 'not_found'
|
|
1035
|
+
};
|
|
1036
|
+
},
|
|
1037
|
+
|
|
1038
|
+
deleteByQuery: async (params) => {
|
|
1039
|
+
const sourceId = params.body?.query?.term?.chemical_set_identifier
|
|
1040
|
+
|| params.body?.query?.term?.source_id;
|
|
1041
|
+
if (!sourceId) {
|
|
1042
|
+
return { deleted: 0, failures: [] };
|
|
1043
|
+
}
|
|
1044
|
+
const result = await this.deleteBySourceId(sourceId);
|
|
1045
|
+
return {
|
|
1046
|
+
deleted: result ? 1 : 0,
|
|
1047
|
+
failures: []
|
|
1048
|
+
};
|
|
1049
|
+
},
|
|
1050
|
+
|
|
1051
|
+
search: async (params) => {
|
|
1052
|
+
let searchTerm = '';
|
|
1053
|
+
let limit = params.body?.size || 10;
|
|
1054
|
+
|
|
1055
|
+
if (params.index === 'synonym_lookup_index') {
|
|
1056
|
+
const query = params.body?.query;
|
|
1057
|
+
searchTerm = query?.match?.chemical_name ||
|
|
1058
|
+
query?.term?.chemical_name ||
|
|
1059
|
+
query?.query_string?.query || '';
|
|
1060
|
+
const searchResults = await this.searchBySynonym(searchTerm, limit);
|
|
1061
|
+
|
|
1062
|
+
return {
|
|
1063
|
+
took: 1,
|
|
1064
|
+
timed_out: false,
|
|
1065
|
+
_shards: {
|
|
1066
|
+
total: 1,
|
|
1067
|
+
successful: 1,
|
|
1068
|
+
skipped: 0,
|
|
1069
|
+
failed: 0
|
|
1070
|
+
},
|
|
1071
|
+
hits: {
|
|
1072
|
+
total: {
|
|
1073
|
+
value: searchResults.results.length,
|
|
1074
|
+
relation: 'eq'
|
|
1075
|
+
},
|
|
1076
|
+
max_score: searchResults.results[0]?.score || 0,
|
|
1077
|
+
hits: searchResults.results.map(result => ({
|
|
1078
|
+
_index: params.index,
|
|
1079
|
+
_id: result.id,
|
|
1080
|
+
_score: result.score,
|
|
1081
|
+
_source: {
|
|
1082
|
+
postgres_id: result.id,
|
|
1083
|
+
chemical_name: result.name,
|
|
1084
|
+
cas_numbers: result.cas,
|
|
1085
|
+
identifier_values: result.identifiers,
|
|
1086
|
+
synonyms: result.synonyms
|
|
1087
|
+
}
|
|
1088
|
+
}))
|
|
1089
|
+
}
|
|
1090
|
+
};
|
|
1091
|
+
} else {
|
|
1092
|
+
const query = params.body?.query;
|
|
1093
|
+
searchTerm = query?.match?.chemical_name ||
|
|
1094
|
+
query?.term?.chemical_name ||
|
|
1095
|
+
query?.query_string?.query || '';
|
|
1096
|
+
const searchResults = await this.searchByName(searchTerm, limit);
|
|
1097
|
+
|
|
1098
|
+
return {
|
|
1099
|
+
took: 1,
|
|
1100
|
+
timed_out: false,
|
|
1101
|
+
_shards: {
|
|
1102
|
+
total: 1,
|
|
1103
|
+
successful: 1,
|
|
1104
|
+
skipped: 0,
|
|
1105
|
+
failed: 0
|
|
1106
|
+
},
|
|
1107
|
+
hits: {
|
|
1108
|
+
total: {
|
|
1109
|
+
value: searchResults.results.length,
|
|
1110
|
+
relation: 'eq'
|
|
1111
|
+
},
|
|
1112
|
+
max_score: searchResults.results[0]?.score || 0,
|
|
1113
|
+
hits: searchResults.results.map(result => ({
|
|
1114
|
+
_index: params.index,
|
|
1115
|
+
_id: result.id,
|
|
1116
|
+
_score: result.score,
|
|
1117
|
+
_source: {
|
|
1118
|
+
postgres_id: result.id,
|
|
1119
|
+
chemical_name: result.name,
|
|
1120
|
+
cas_numbers: result.cas,
|
|
1121
|
+
identifier_values: result.identifiers,
|
|
1122
|
+
synonyms: result.synonyms
|
|
1123
|
+
}
|
|
1124
|
+
}))
|
|
1125
|
+
}
|
|
1126
|
+
};
|
|
1127
|
+
}
|
|
1128
|
+
},
|
|
1129
|
+
|
|
1130
|
+
count: async (params) => {
|
|
1131
|
+
if (params.index === 'synonym_lookup_index') {
|
|
1132
|
+
return await this.getTotalSynonymCount();
|
|
1133
|
+
}
|
|
1134
|
+
return await this.countAll();
|
|
1135
|
+
}
|
|
1136
|
+
};
|
|
1137
|
+
}
|
|
1138
|
+
|
|
1139
|
+
registerElasticsearchHandlers(elasticsearchService) {
|
|
1140
|
+
const configurablePatterns = this.connection.config.indexRoutes?.chemicals || ['chemicals*'];
|
|
1141
|
+
const legacyPatterns = ['synonym_lookup_index', 'chemical_data_index', 'chemical_converter_index'];
|
|
1142
|
+
const allPatterns = [...new Set([...configurablePatterns, ...legacyPatterns])];
|
|
1143
|
+
const handlers = this._buildEsHandlers();
|
|
1144
|
+
allPatterns.forEach(pattern => {
|
|
1145
|
+
elasticsearchService.registerIndexRoute(pattern, handlers);
|
|
1146
|
+
});
|
|
1147
|
+
}
|
|
1148
|
+
}
|
|
1149
|
+
|
|
1150
1150
|
module.exports = ChemicalsService;
|